逻辑回归算法原理

损失函数（Cost Function）

# Toy 1-D training set for the logistic-regression demo: (feature x, binary label y) pairs.
samples = [(-5, 1), (-20, 0), (-2, 1)]

def sigmoid(theta, x):
    """Logistic (sigmoid) function of a 1-D linear model: 1 / (1 + e^(-theta*x)).

    Args:
        theta: scalar model weight.
        x: scalar input feature.

    Returns:
        A value in (0, 1), interpreted as P(y = 1 | x).
    """
    # The pasted original had the body at column 0 (a SyntaxError); re-indented.
    # math.exp(z) is the idiomatic equivalent of math.e ** z.
    return 1.0 / (1.0 + math.exp(-theta * x))

def cost(theta):
    """Half mean-squared-error cost of weight `theta` over the module-level `samples`.

    Computes (1 / (2 * m)) * sum_i (sigmoid(theta, x_i) - y_i)^2, the squared-error
    loss used here to visualize why it is non-convex for logistic regression.

    Args:
        theta: scalar model weight.

    Returns:
        Non-negative float cost.
    """
    # The pasted original had the body at column 0 (a SyntaxError); re-indented.
    diffs = [sigmoid(theta, x) - y for x, y in samples]
    # Divide by 2*m: the 1/2 factor is conventional so the gradient has no stray 2.
    return sum(d * d for d in diffs) / (2 * len(samples))

# Sweep theta over [-1, 1) in steps of 0.01 and plot the resulting cost curve,
# visualizing the loss landscape of the squared-error cost above.
X = np.arange(-1, 1, 0.01)
Y = np.array(list(map(cost, X)))
plt.plot(X, Y)
plt.show()

使用Scikit-Learn进行逻辑回归

L1/L2范数

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

nb_samples = 500

# Build a synthetic 2-feature binary-classification set and hold out 25% for testing.
X, Y = make_classification(
    n_samples=nb_samples,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

# Fit a plain logistic-regression classifier on the training split.
lr = LogisticRegression()
lr.fit(X_train, Y_train)

train_score = lr.score(X_train, Y_train)  # mean accuracy on the training samples
test_score = lr.score(X_test, Y_test)  # mean accuracy on the held-out test set
print(train_score)
print(test_score)

GridSearchCV：

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
import numpy as np

# The original snippet referenced `iris` without ever defining it (NameError);
# load the standard iris dataset here.
iris = load_iris()

# Search both penalty norms and regularization strengths C = 10^-10 .. 10^9.
param_grid = {
    'penalty': ["l1", "l2"],
    'C': np.power(10.0, np.arange(-10, 10)),
}

# solver='liblinear' supports both the L1 and L2 penalties in the grid;
# the default 'lbfgs' solver raises an error for penalty='l1'.
gs = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear'),
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
)

gs.fit(iris.data, iris.target)
print(gs.best_estimator_)

LogisticRegressionCV：

from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import KFold
from sklearn.datasets import load_iris
import numpy as np

# The original snippet referenced `iris` without ever defining it (NameError);
# load the standard iris dataset here.
iris = load_iris()

# Fixed random_state keeps the fold assignment (and hence results) reproducible.
fold = KFold(n_splits=5, shuffle=True, random_state=777)

searchCV = LogisticRegressionCV(
    Cs=list(np.power(10.0, np.arange(-10, 10))),  # candidate regularization strengths
    penalty='l2',
    # NOTE(review): iris.target has 3 classes while plain 'roc_auc' expects a
    # binary target — confirm this runs on the installed sklearn version.
    scoring='roc_auc',
    cv=fold,
    random_state=777,
    max_iter=10000,
    fit_intercept=True,
    solver='newton-cg',
    tol=10,  # NOTE(review): extremely loose tolerance, presumably for demo speed — confirm
)

searchCV.fit(iris.data, iris.target)
# scores_ maps each class label to a (n_folds, n_Cs) score array; report the best
# mean-over-folds AUC for class label 1.
print('Max auc_roc:', searchCV.scores_[1].mean(axis=0).max())