## 数据探索

# Load the training set; force the question columns to string dtype so
# downstream tokenization sees str, not inferred object/float.
question_dtypes = {'question1': str, 'question2': str}
train = pd.read_csv('train.csv', dtype=question_dtypes)
duplicate_ratio = train.is_duplicate.mean()
print('Training dataset row number:', len(train))  # 404290
print('Duplicate question pairs ratio: %.2f' % duplicate_ratio)  # 0.37

| question1 | question2 | is_duplicate |
| --- | --- | --- |
| What is the step by step guide to invest in share market in india? | What is the step by step guide to invest in share market? | 0 |
| How can I be a good geologist? | What should I do to be a great geologist? | 1 |
| How can I increase the speed of my internet connection while using a VPN? | How can Internet speed be increased by hacking through DNS? | 0 |

## 特征工程

q1_word_num / q2_word_num：问题1/问题2中的单词数
q1_length / q2_length：问题1/问题2中的字符数
word_share：问题之间共享单词的比率
same_first_word：如果两个问题的第一个单词相同，则为1，否则为0

def word_share(row):
    """Jaccard similarity between the token sets of the two questions.

    Parameters: row — a mapping/Series with string fields 'question1'
    and 'question2'.
    Returns a float in [0, 1]; 0.0 when both questions tokenize to
    nothing (guards the original ZeroDivisionError on empty input).
    """
    q1_words = set(word_tokenize(row['question1']))
    q2_words = set(word_tokenize(row['question2']))

    union_size = len(q1_words.union(q2_words))
    # Both token sets empty -> union is empty; avoid dividing by zero.
    if union_size == 0:
        return 0.0
    return len(q1_words.intersection(q2_words)) / union_size
def same_first_word(row):
    """Return 1.0 if both questions start with the same word (case-insensitive), else 0.0.

    Parameters: row — a mapping/Series with string fields 'question1'
    and 'question2'.
    Guards against IndexError when either question tokenizes to an
    empty list (empty or whitespace-only strings).
    """
    q1_words = word_tokenize(row['question1'])
    q2_words = word_tokenize(row['question2'])

    # No first token to compare -> treat as "not the same".
    if not q1_words or not q2_words:
        return 0.0
    return float(q1_words[0].lower() == q2_words[0].lower())
# Build a sample of hand-crafted features on the train DataFrame.
def _token_count(text):
    # Number of NLTK tokens in one question.
    return len(word_tokenize(text))

train['word_share'] = train.apply(word_share, axis=1)
train['q1_word_num'] = train['question1'].apply(_token_count)
train['q2_word_num'] = train['question2'].apply(_token_count)
train['word_num_difference'] = (train['q1_word_num'] - train['q2_word_num']).abs()
train['q1_length'] = train['question1'].str.len()
train['q2_length'] = train['question2'].str.len()
train['length_difference'] = (train['q1_length'] - train['q2_length']).abs()
train['q1_has_fullstop'] = train['question1'].str.contains('.', regex=False).astype(int)
train['q2_has_fullstop'] = train['question2'].str.contains('.', regex=False).astype(int)
train['q1_has_math_expression'] = train['question1'].str.contains('[math]', regex=False).astype(int)
train['q2_has_math_expression'] = train['question2'].str.contains('[math]', regex=False).astype(int)
train['same_first_word'] = train.apply(same_first_word, axis=1)

## 模型性能

# Hold out 20% for evaluation. A fixed random_state makes the split — and
# therefore the metric values recorded in the comments below — reproducible;
# without it every run produced different numbers.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=50, n_jobs=8)
model.fit(X_train, y_train)
# log_loss needs class probabilities; accuracy/F1 use the hard predictions.
predictions_proba = model.predict_proba(X_test)
predictions = model.predict(X_test)
log_loss_score = log_loss(y_test, predictions_proba)
acc = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print('Log loss: %.5f' % log_loss_score)  # 0.62923
print('Acc: %.5f' % acc)  # 0.70952
print('F1: %.5f' % f1)  # 0.59173

## 特征的重要性

# Fit a boosted model on the full data purely to read off feature importances.
model = XGBClassifier(n_estimators=500)
model.fit(X, y)

feature_importance = model.feature_importances_

plt.figure(figsize=(16, 6))
# 'nonposy' was deprecated in Matplotlib 3.3 and removed in 3.5;
# 'nonpositive' is the supported keyword for clipping non-positive values on a log axis.
plt.yscale('log', nonpositive='clip')
plt.bar(range(len(feature_importance)), feature_importance, align='center')
# NOTE(review): 'features' (the label list) is presumably defined elsewhere — confirm.
plt.xticks(range(len(feature_importance)), features, rotation='vertical')
plt.title('Feature importance')
plt.ylabel('Importance')
plt.xlabel('Features')
plt.show()

def extract_pruned_features(feature_importances, min_score=0.05):
    """Return the index labels of features whose 'weights' exceed min_score.

    Parameters: feature_importances — DataFrame with a 'weights' column,
    indexed by feature name; min_score — strict lower bound on the weight.
    Returns a numpy array of the surviving index values.
    """
    keep_mask = feature_importances['weights'] > min_score
    return feature_importances.loc[keep_mask].index.values
# Keep only features whose importance weight exceeds the threshold.
# Fixed the misspelled local name 'pruned_featurse' -> 'pruned_features'.
# NOTE(review): 'feature_importances' (a DataFrame with a 'weights' column)
# is built elsewhere — distinct from the raw 'feature_importance' array above; confirm.
pruned_features = extract_pruned_features(feature_importances, min_score=0.01)
X_train_reduced = X_train[pruned_features]
X_test_reduced = X_test[pruned_features]
def fit_and_print_metrics(X_train, y_train, X_test, y_test, model):
    """Fit `model` on the training split and print its log loss on the test split.

    Parameters: the four data splits plus any estimator exposing
    fit()/predict_proba(). Prints the score; returns None.
    """
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)
    print('Log loss: %.5f' % log_loss(y_test, proba))

## 基于特征重要度分析的模型性能

model = RandomForestClassifier(n_estimators=50, n_jobs=8)

# Compare the pruned feature set (LogLoss 0.59251) against the full
# feature set (LogLoss 0.63376); the same estimator object is refit each time.
for features_train, features_test in ((X_train_reduced, X_test_reduced),
                                      (X_train, X_test)):
    fit_and_print_metrics(features_train, y_train, features_test, y_test, model)