## 线性模型

$sigmoid$ 是连续可导函数，为我们计算带来了方便。
$sigmoid$ 是最简单的满足这样条件的函数，更“自然”，数学上的解释 —— 满足最大似然估计的结果。

$sigmoid$ 函数，使得预测的结果从样本成绩变成了样本成绩的概率。

## 损失函数

$-\log(p) \quad \text{if} \quad y=1$
$-\log(1-p) \quad \text{if} \quad y=0$

其中 $p$ 为模型预测样本属于正类（$y=1$）的概率。

$L(p, y) = -y\,\log(p) - (1-y)\,\log(1-p)$

对全部 $m$ 个样本取平均：$\frac{1}{m}\sum_{i=1}^{m}L(p_{i}, y_{i})$

## 损失函数的梯度

$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y_{i}\log(\sigma(X^{i}_{b}\theta)) + (1-y_{i})\log(1-\sigma(X^{i}_{b}\theta))\right]$

$\sigma(t) = \frac{1}{1+e^{-t}} = (1+e^{-t})^{-1}$

$\sigma'(t) = -(1+e^{-t})^{-2}\cdot e^{-t}\cdot(-1) = (1+e^{-t})^{-2}\cdot e^{-t}$

$(\log\sigma(t))' = \frac{1}{\sigma(t)}\cdot\sigma'(t) = \frac{1}{\sigma(t)}\cdot(1+e^{-t})^{-2}\cdot e^{-t}$
$(\log\sigma(t))' = \frac{1}{(1+e^{-t})^{-1}}\cdot(1+e^{-t})^{-2}\cdot e^{-t}$
$(\log\sigma(t))' = (1+e^{-t})^{-1}\cdot e^{-t}$
$(\log\sigma(t))' = \frac{e^{-t}}{1+e^{-t}} = \frac{1+e^{-t}-1}{1+e^{-t}} = 1-\frac{1}{1+e^{-t}} = 1-\sigma(t)$

$(\log(1-\sigma(t)))' = \frac{1}{1-\sigma(t)}\cdot(-1)\cdot\sigma'(t) = -\frac{1}{1-\sigma(t)}\cdot(1+e^{-t})^{-2}\cdot e^{-t}$

$y_{i}X^{i}_{j} - \sigma(X^{i}_{b}\theta)\cdot X^{i}_{j} = (y_{i}-\sigma(X^{i}_{b}\theta))\cdot X^{i}_{j}$

$\frac{\partial J(\theta)}{\partial\theta_{j}} = \frac{1}{m}\sum_{i=1}^{m}(\sigma(X^{i}_{b}\theta)-y_{i})X^{i}_{j}$

$\frac{\partial J(\theta)}{\partial\theta_{j}} = \frac{1}{m}\sum_{i=1}^{m}(\hat{y}_{i}-y_{i})X^{i}_{j}$

$\nabla J(\theta) = \begin{Bmatrix} \frac{\partial J}{\partial\theta_{0}} \\ \frac{\partial J}{\partial\theta_{1}} \\ \cdots \\ \frac{\partial J}{\partial\theta_{n}} \end{Bmatrix} = \frac{1}{m}\cdot\begin{Bmatrix} \sum_{i=1}^{m}(\sigma(X^{i}_{b}\theta)-y_{i}) \\ \sum_{i=1}^{m}(\sigma(X^{i}_{b}\theta)-y_{i})\cdot X^{i}_{1} \\ \cdots \\ \sum_{i=1}^{m}(\sigma(X^{i}_{b}\theta)-y_{i})\cdot X^{i}_{n} \end{Bmatrix} = \frac{1}{m}\cdot X^{T}_{b}\cdot(\sigma(X_{b}\theta)-y)$

## 解决二分类问题：鸢尾花分类

import numpy as np
from .metrics import accuracy_score
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Follows the scikit-learn style API: fit / predict / predict_proba / score.
    """

    def __init__(self):
        """Initialize an unfitted model (all parameters None)."""
        self.coef_ = None       # feature weights, theta[1:]
        self.intercept_ = None  # bias term, theta[0]
        self._theta = None      # full parameter vector [intercept, coef...]

    def _sigmoid(self, t):
        """Element-wise logistic function sigma(t) = 1 / (1 + e^-t)."""
        return 1. / (1. + np.exp(-t))

    def fit(self, X_train, y_train, eta=0.01, n_iters=1e4):
        """Fit the model on X_train/y_train by batch gradient descent.

        Parameters
        ----------
        X_train : array of shape (m, n) — training samples.
        y_train : array of shape (m,) — binary labels in {0, 1}.
        eta : float — learning rate.
        n_iters : number — maximum gradient-descent iterations.

        Returns
        -------
        self, with coef_, intercept_ and _theta populated.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"

        def J(theta, X_b, y):
            # Mean cross-entropy loss. The original wrapped this in a bare
            # `except:` returning inf, but np.log(0) never raises (it returns
            # -inf with a warning); clipping the probabilities keeps the loss
            # finite and well-defined instead.
            y_hat = self._sigmoid(X_b.dot(theta))
            eps = 1e-15
            y_hat = np.clip(y_hat, eps, 1. - eps)
            return -np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat)) / len(y)

        def dJ(theta, X_b, y):
            # Vectorized gradient: (1/m) * X_b^T . (sigma(X_b theta) - y)
            return X_b.T.dot(self._sigmoid(X_b.dot(theta)) - y) / len(y)

        def gradient_descent(X_b, y, initial_theta, eta, n_iters=1e4, epsilon=1e-8):
            # Iterate until the loss stops improving by more than epsilon
            # or n_iters is reached.
            theta = initial_theta
            cur_iter = 0
            while cur_iter < n_iters:
                gradient = dJ(theta, X_b, y)
                last_theta = theta
                theta = theta - eta * gradient
                if abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon:
                    break
                cur_iter += 1
            return theta

        # Prepend a column of ones so theta[0] acts as the intercept.
        X_b = np.hstack([np.ones((len(X_train), 1)), X_train])
        initial_theta = np.zeros(X_b.shape[1])
        self._theta = gradient_descent(X_b, y_train, initial_theta, eta, n_iters)
        self.intercept_ = self._theta[0]
        self.coef_ = self._theta[1:]
        return self

    def predict_proba(self, X_predict):
        """Return P(y=1) for each row of X_predict as a 1-D array."""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        X_b = np.hstack([np.ones((len(X_predict), 1)), X_predict])
        return self._sigmoid(X_b.dot(self._theta))

    def predict(self, X_predict):
        """Return hard 0/1 labels for X_predict (threshold at 0.5)."""
        assert self.intercept_ is not None and self.coef_ is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == len(self.coef_), \
            "the feature number of X_predict must be equal to X_train"
        proba = self.predict_proba(X_predict)
        return np.array(proba >= 0.5, dtype='int')

    def score(self, X_test, y_test):
        """Return classification accuracy on the test set."""
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "LogisticRegression()"

sklearn 调包：

import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris                  # iris dataset
from sklearn.model_selection import train_test_split    # train/test splitting
from sklearn.linear_model import LogisticRegression     # logistic regression

# Evaluation metric
from sklearn.metrics import accuracy_score

# Load the data — the original script used `iris` without ever calling
# load_iris(), which raised NameError.
iris = load_iris()
iris_X = iris.data[:100, ]     # 4 features; only the first 100 samples are kept so the labels are binary (0 and 1)
iris_y = iris.target[:100, ]   # labels take two values: 0 and 1

# Split into train/test — the original script used X_train/y_train/X_test/y_test
# without ever defining them.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X, iris_y, test_size=0.3, random_state=0)

model = LogisticRegression()   # logistic regression as the classifier
model.fit(X_train, y_train)    # train the model
y_test_pred = model.predict(X=X_test)   # predict labels for the test set

print(y_test_pred)             # predicted test-set labels
print(y_test)                  # actual test-set labels
accuracy_score(y_test, y_test_pred)  # model accuracy on the test set

## 解决多分类问题：OvR、OvO

OvR： 一对剩余所有

对于 $n$ 个类别的分类问题，OvR 需要训练 $n$ 个二分类器：每次把一个类别作为正类，其余 $n-1$ 个类别作为负类。

from sklearn.multiclass import OneVsRestClassifier
# Wrap a binary estimator to handle multi-class via one-vs-rest.
# NOTE(review): `log_reg` is assumed to be a LogisticRegression instance
# defined earlier in the notebook/session — TODO confirm.
ovr = OneVsRestClassifier(log_reg)
ovr.fit(X_train, y_train)
ovr.score(X_test, y_test)

OvO：一对一

对于 $n$ 个类别的分类问题，OvO 需要在每两个类别之间训练一个二分类器，共 $C(n,\,2)=\frac{n(n-1)}{2}$ 个。

from sklearn.multiclass import OneVsOneClassifier
# Wrap a binary estimator to handle multi-class via one-vs-one (a classifier
# per pair of classes). NOTE(review): `log_reg` is assumed to be a
# LogisticRegression instance defined earlier — TODO confirm.
ovo = OneVsOneClassifier(log_reg)
ovo.fit(X_train, y_train)
ovo.score(X_test, y_test)