Press "Enter" to skip to content

## 2.2 依赖准备

```pip install numpy
pip install scipy
pip install pandas
pip install jupyter
pip install requests```

```import pandas as pd
import numpy as np
# Load the raw event log; the path is relative to the current working directory.
df = pd.read_csv('resource/events.csv')
# Bare expression: has no visible effect when run as a script.
df.shape
# Preview the first rows of the frame.
print(df.head())```

- `timestamp`：时间戳
- `visitorid`：用户ID
- `event`：事件类型
- `itemid`：物品ID
- `transactionid`：事务ID

`print(df.event.unique())`

```trans = df[df['event'] == 'transaction']  # keep only rows whose event is 'transaction'
# Bare expression: has no visible effect when run as a script.
trans.shape
print(trans.head())```

```visitors = trans['visitorid'].unique()  # distinct visitor ids among the transactions
items = trans['itemid'].unique()  # distinct item ids among the transactions
# Positions within these two arrays are used further down as matrix row/column indices.
print(visitors.shape)
print(items.shape)```

```trans2 = trans.groupby(['visitorid']).head(50)  # at most the first 50 transaction rows per visitor
print(trans2.shape)```

```trans2['visitors'] = trans2['visitorid'].apply(lambda x : np.argwhere(visitors == x)[0][0])  # position of each visitor id in `visitors`
# Position of each item id in `items`; like the line above, this compares against the whole array once per row.
# NOTE(review): trans2 is derived from trans via groupby().head(); pandas may emit SettingWithCopyWarning on these assignments - confirm.
trans2['items'] = trans2['itemid'].apply(lambda x : np.argwhere(items == x)[0][0])
print(trans2)```

## 2.3 构建矩阵

### 2.3.1 构建用户-物品矩阵

`from scipy.sparse import csr_matrix`

```occurences = csr_matrix((visitors.shape[0], items.shape[0]), dtype='int8')  # empty (number of visitors) x (number of items) matrix
# NOTE: the function body below has lost its indentation in this listing and must be indented to run.
def set_occurences(visitor, item):
# Increment the count stored at (visitor, item) in the module-level matrix.
occurences[visitor, item] += 1
# Called once per row of trans2 with that row's visitor/item positions.
trans2.apply(lambda row: set_occurences(row['visitors'], row['items']), axis=1)
print(occurences)```

```(0, 0)        1
(1, 1)        1
(1, 37)       1
(1, 72)       1
(1, 108)      1
(1, 130)      1
(1, 131)      1
(1, 132)      1
(1, 133)      1
(1, 162)      1
(1, 163)      1
(1, 164)      1
(2, 2)        1
(3, 3)        1
(3, 161)      1
(4, 4)        1
(4, 40)       1
(5, 5)        1
(5, 6)        1
(5, 18)       1
(5, 19)       1
(5, 54)       1
(5, 101)      1
(5, 111)      1
(5, 113)      1
:     :
(11695, 383)  1
(11696, 12007)        1
(11696, 12021)        1
(11697, 12008)        1
(11698, 12011)        1
(11699, 1190) 1
(11700, 506)  1
(11701, 11936)        1
(11702, 10796)        1
(11703, 12013)        1
(11704, 12016)        1
(11705, 12017)        1
(11706, 674)  1
(11707, 3653) 1
(11708, 12018)        1
(11709, 12019)        1
(11710, 1330) 1
(11711, 4184) 1
(11712, 3595) 1
(11713, 12023)        1
(11714, 3693) 1
(11715, 5690) 1
(11716, 6280) 1
(11717, 3246) 1
(11718, 2419) 1
View Code```

### 2.3.2 构建物品-物品共生矩阵

```cooc = occurences.transpose().dot(occurences)  # (items x visitors) . (visitors x items) -> items x items
# Overwrite the diagonal (an item paired with itself) with 0.
cooc.setdiag(0)
print(cooc)```

```(0, 0)        0
(164, 1)      1
(163, 1)      1
(162, 1)      1
(133, 1)      1
(132, 1)      1
(131, 1)      1
(130, 1)      1
(108, 1)      1
(72, 1)       1
(37, 1)       1
(1, 1)        0
(2, 2)        0
(161, 3)      1
(3, 3)        0
(40, 4)       1
(4, 4)        0
(8228, 5)     1
(8197, 5)     1
(8041, 5)     1
(8019, 5)     1
(8014, 5)     1
(8009, 5)     1
(8008, 5)     1
(7985, 5)     1
:     :
(11997, 12022)        1
(2891, 12022) 1
(12023, 12023)        0
(12024, 12024)        0
(11971, 12024)        1
(11880, 12024)        1
(10726, 12024)        1
(8694, 12024) 1
(4984, 12024) 1
(4770, 12024) 1
(4767, 12024) 1
(4765, 12024) 1
(4739, 12024) 1
(4720, 12024) 1
(4716, 12024) 1
(4715, 12024) 1
(4306, 12024) 1
(2630, 12024) 1
(2133, 12024) 1
(978, 12024)  1
(887, 12024)  1
(851, 12024)  1
(768, 12024)  1
(734, 12024)  1
(220, 12024)  1
View Code```

| | 事件A | 任何事件但不包含A |
| --- | --- | --- |
| 事件B | A和B同时发生（K_11） | B发生，但A不发生（K_12） |
| 任何事件但不包含B | A发生，但B不发生（K_21） | A和B都不发生（K_22） |

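$$\mathrm{LLR} = 2\,N\,\bigl(H(k) - H(\operatorname{rowSums}(k)) - H(\operatorname{colSums}(k))\bigr)$$

其中 $k$ 为上表中的 2×2 计数矩阵，$N=\sum_{i,j}k_{ij}$，$H(x)=\sum_i \frac{x_i}{N}\ln\frac{x_i}{N}$。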

```def xLogX(x):
# NOTE: this listing has lost its indentation; every function body below must be indented to run.
# x * ln(x), with 0.0 returned when x is 0.
return x * np.log(x) if x != 0 else 0.0
def entropy(x1, x2=0, x3=0, x4=0):
# xLogX of the total minus xLogX of each individual count.
return xLogX(x1 + x2 + x3 + x4) - xLogX(x1) - xLogX(x2) - xLogX(x3) - xLogX(x4)
def LLR(k11, k12, k21, k22):
# k11..k22 are the four cells of a 2x2 count table; rows are (k11, k12) and (k21, k22).
rowEntropy = entropy(k11 + k12, k21 + k22)
columnEntropy = entropy(k11 + k21, k12 + k22)
matrixEntropy = entropy(k11, k12, k21, k22)
# Return 0.0 rather than a negative value when the difference below would be negative.
if rowEntropy + columnEntropy < matrixEntropy:
return 0.0
return 2.0 * (rowEntropy + columnEntropy - matrixEntropy)
def rootLLR(k11, k12, k21, k22):
# Square root of LLR, negated when k11's share of (k11 + k12) is below k21's share of (k21 + k22).
llr = LLR(k11, k12, k21, k22)
sqrt = np.sqrt(llr)
if k11 * 1.0 / (k11 + k12) < k21 * 1.0 / (k21 + k22):
sqrt = -sqrt
return sqrt```

K11：事件A和事件B同时发生
K12：事件B发生，而事件A不发生
K21：事件A发生，而事件B不发生
K22：事件A和B都不发生

```row_sum = np.sum(cooc, axis=0).A.flatten()  # sums along axis 0, flattened to 1-D
column_sum = np.sum(cooc, axis=1).A.flatten()  # sums along axis 1, flattened to 1-D
total = np.sum(row_sum, axis=0)  # sum of all entries of cooc
# Empty matrix with cooc's shape that receives the scores.
pp_score = csr_matrix((cooc.shape[0], cooc.shape[1]), dtype='double')
cx = cooc.tocoo()
# NOTE: the loop and if bodies below have lost their indentation in this listing.
# Visit every stored (row, column, value) entry of cooc and score the non-zero ones.
for i,j,v in zip(cx.row, cx.col, cx.data):
if v != 0:
k11 = v
k12 = row_sum[i] - k11
k21 = column_sum[j] - k11
k22 = total - k11 - k12 - k21
pp_score[i,j] = rootLLR(k11, k12, k21, k22)```

```result = np.flip(np.sort(pp_score.A, axis=1), axis=1)  # each row's scores as a dense array, sorted descending
# Column indices of each row in the same descending-score order.
# NOTE(review): `.A` converts the sparse matrix to a dense array; confirm the installed SciPy still provides it.
result_indices = np.flip(np.argsort(pp_score.A, axis=1), axis=1)```

```print(result[8456])  # descending scores of row 8456
# Column indices belonging to those scores.
print(result_indices[8456])```

```minLLR = 5  # scores below this threshold are zeroed
indicators = result[:, :50]  # first 50 columns of every row
indicators[indicators < minLLR] = 0.0
indicators_indices = result_indices[:, :50]
# Per row, position of the first zero score (argmax yields 0 for a row that contains no zero).
max_indicator_indices = (indicators==0).argmax(axis=1)
# NOTE: this name shadows the builtin max() for the rest of the script.
max = max_indicator_indices.max()
# Keep only the columns up to and including that position.
indicators = indicators[:, :max+1]
indicators_indices = indicators_indices[:, :max+1]```

```import requests
import json```

```actions = []  # pending bulk lines: one action line followed by one document line per item
# NOTE: the bodies of the for/if statements and the dict literals below have lost their indentation in this listing.
for i in range(indicators.shape[0]):
length = indicators[i].nonzero()[0].shape[0]  # number of non-zero scores in row i
# Item ids for the first `length` indicator columns of row i.
real_indicators = items[indicators_indices[i, :length]].astype("int").tolist()
id = items[i]

action = { "index" : { "_index" : "items2", "_id" : str(id) } }

data = {
"id": int(id),
"indicators": real_indicators
}

actions.append(json.dumps(action))
actions.append(json.dumps(data))

# Send the batch once 200 lines (two per item) have accumulated.
if len(actions) == 200:
actions_string = "\n".join(actions) + "\n"
actions = []

url = "http://127.0.0.1:9200/_bulk/"
headers = {
"Content-Type" : "application/x-ndjson"
}
# NOTE(review): the response is not inspected, so a rejected batch goes unnoticed.
requests.post(url, headers=headers, data=actions_string)
# Send whatever is left over after the loop.
if len(actions) > 0:
actions_string = "\n".join(actions) + "\n"
actions = []
url = "http://127.0.0.1:9200/_bulk/"
headers = {
"Content-Type" : "application/x-ndjson"
}
requests.post(url, headers=headers, data=actions_string)```

## 3.总结

```import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import requests
import json
# Load the raw event log; the path is relative to the current working directory.
df = pd.read_csv('resource/events.csv')

# Only rows whose event is 'transaction' are used from here on.
trans = df[df['event'] == 'transaction']

# Distinct ids; an id's position in these arrays is its row/column index in
# the matrices built later in the script.
visitors = trans['visitorid'].unique()
items = trans['itemid'].unique()

# Keep at most the first 50 transaction rows per visitor. The explicit copy
# gives trans2 its own data, so adding columns below does not write into a
# frame derived from `trans` (which triggers SettingWithCopyWarning).
trans2 = trans.groupby(['visitorid']).head(50).copy()

# Translate raw ids into array positions with one hash-based lookup per
# column instead of scanning the whole id array for every row.
trans2['visitors'] = pd.Index(visitors).get_indexer(trans2['visitorid'])
trans2['items'] = pd.Index(items).get_indexer(trans2['itemid'])
# Visitor x item count matrix: entry (v, i) is the number of rows of trans2
# that pair visitor position v with item position i.
# The matrix is built in one call from (value, (row, column)) triples; the
# constructor sums triples that share a position, which yields the same
# counts as incrementing cells one at a time but without the repeated
# sparse-structure updates that per-element assignment on a CSR matrix causes.
_visitor_pos = trans2['visitors'].to_numpy()
_item_pos = trans2['items'].to_numpy()
occurences = csr_matrix(
    (np.ones(_visitor_pos.shape[0], dtype='int8'), (_visitor_pos, _item_pos)),
    shape=(visitors.shape[0], items.shape[0]),
    dtype='int8',
)
# Item x item co-occurrence matrix: occurences^T . occurences.
# The product of an int8 matrix is itself int8, so any count above 127 would
# wrap around; widen to int32 before multiplying.
_occurences_wide = occurences.astype('int32')
cooc = _occurences_wide.transpose().dot(_occurences_wide)
# The diagonal pairs every item with itself and carries no signal: zero it,
# then drop the explicit zeros that setdiag leaves in the sparse structure.
cooc.setdiag(0)
cooc.eliminate_zeros()
def xLogX(x):
    """Return ``x * ln(x)``, using 0.0 for ``x == 0``.

    The argument is converted to a Python float first: for small-integer
    NumPy scalars such as ``np.int8``, ``np.log`` would otherwise compute in
    float16 and lose most of the precision.
    """
    if x == 0:
        return 0.0
    value = float(x)
    return value * np.log(value)
def entropy(x1, x2=0, x3=0, x4=0):
    """Return ``xLogX(total) - sum(xLogX(x_i))`` for up to four counts.

    ``total`` is the sum of the arguments; omitted arguments default to 0 and
    contribute nothing.
    """
    total = x1 + x2 + x3 + x4
    return xLogX(total) - xLogX(x1) - xLogX(x2) - xLogX(x3) - xLogX(x4)
def LLR(k11, k12, k21, k22):
    """Return the log-likelihood ratio of a 2x2 count table.

    The table rows are ``(k11, k12)`` and ``(k21, k22)``. The result is
    ``2 * (rowEntropy + columnEntropy - matrixEntropy)``, with 0.0 returned
    whenever that difference would be negative.
    """
    rowEntropy = entropy(k11 + k12, k21 + k22)
    columnEntropy = entropy(k11 + k21, k12 + k22)
    matrixEntropy = entropy(k11, k12, k21, k22)
    if rowEntropy + columnEntropy < matrixEntropy:
        # The difference is negative here; report 0.0 instead.
        return 0.0
    return 2.0 * (rowEntropy + columnEntropy - matrixEntropy)
def rootLLR(k11, k12, k21, k22):
    """Return the signed square root of ``LLR(k11, k12, k21, k22)``.

    The root is negated when ``k11 / (k11 + k12)`` is smaller than
    ``k21 / (k21 + k22)``.
    """
    root = np.sqrt(LLR(k11, k12, k21, k22))
    # Cross-multiplied form of k11/(k11+k12) < k21/(k21+k22). It gives the
    # same answer for positive row totals and avoids dividing by zero when a
    # row of the table is empty (the comparison is then simply False).
    if float(k11) * (k21 + k22) < float(k21) * (k11 + k12):
        root = -root
    return root
# Marginal totals of the co-occurrence matrix. row_sum is taken along axis 0
# and column_sum along axis 1; cooc is a product of the form M^T . M and
# therefore symmetric, so the two vectors hold the same values.
row_sum = np.asarray(cooc.sum(axis=0)).ravel()
column_sum = np.asarray(cooc.sum(axis=1)).ravel()
total = np.sum(row_sum, axis=0)

# Score every stored non-zero pair. The scores are collected as triples and
# turned into a sparse matrix in one step, because assigning single cells of
# a CSR matrix rebuilds its internal arrays on every insertion.
cx = cooc.tocoo()
_score_rows, _score_cols, _score_vals = [], [], []
# Widen the stored counts so the arithmetic below runs in 64-bit integers
# regardless of cooc's dtype.
for i, j, v in zip(cx.row, cx.col, cx.data.astype(np.int64)):
    if v == 0:
        continue
    k11 = v
    k12 = row_sum[i] - k11
    k21 = column_sum[j] - k11
    k22 = total - k11 - k12 - k21
    _score_rows.append(i)
    _score_cols.append(j)
    _score_vals.append(rootLLR(k11, k12, k21, k22))

pp_score = csr_matrix(
    (_score_vals, (_score_rows, _score_cols)),
    shape=(cooc.shape[0], cooc.shape[1]),
    dtype='double',
)

# Rank every row's scores from highest to lowest.
# toarray() replaces the `.A` shorthand, which recent SciPy releases no
# longer provide on sparse matrices. The matrix is densified and sorted only
# once: the scores are gathered through the same ordering that produces the
# indices.
_dense_scores = pp_score.toarray()
result_indices = np.flip(np.argsort(_dense_scores, axis=1), axis=1)
result = np.take_along_axis(_dense_scores, result_indices, axis=1)
print(result.shape)
print(result[8456])
print(result_indices[8456])
# Scores below this threshold are discarded.
minLLR = 5
# First 50 columns of every row. This is a view, so the thresholding on the
# next line also zeroes the corresponding cells of `result`.
indicators = result[:, :50]
indicators[indicators < minLLR] = 0.0
indicators_indices = result_indices[:, :50]
# Number of surviving scores per row. Counting non-zeros, rather than
# locating the first zero with argmax, also handles a row in which all 50
# scores survive: argmax reports 0 for such a row, which could truncate it.
max_indicator_indices = np.count_nonzero(indicators, axis=1)
# Named max_len so the builtin max() is not shadowed.
max_len = max_indicator_indices.max()
# Trim both arrays to the widest row, plus one trailing column.
indicators = indicators[:, :max_len + 1]
indicators_indices = indicators_indices[:, :max_len + 1]
def _post_bulk(lines):
    """POST one newline-delimited batch to the local ``_bulk`` endpoint.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, so a rejected batch is not silently lost.
    """
    payload = "\n".join(lines) + "\n"
    response = requests.post(
        "http://127.0.0.1:9200/_bulk/",
        headers={"Content-Type": "application/x-ndjson"},
        data=payload,
        timeout=60,  # do not hang forever if the server stops responding
    )
    response.raise_for_status()


# Pending bulk lines: one action line followed by one document line per item.
actions = []
for i in range(indicators.shape[0]):
    # Number of non-zero scores in row i; only that many leading index
    # columns are read for this item.
    length = indicators[i].nonzero()[0].shape[0]
    real_indicators = items[indicators_indices[i, :length]].astype("int").tolist()
    item_id = items[i]

    action = {"index": {"_index": "items2", "_id": str(item_id)}}
    document = {
        "id": int(item_id),
        "indicators": real_indicators,
    }

    actions.append(json.dumps(action))
    actions.append(json.dumps(document))

    # Flush once 200 lines (two per item) have accumulated.
    if len(actions) >= 200:
        _post_bulk(actions)
        actions = []

# Flush whatever is left after the loop.
if actions:
    _post_bulk(actions)
    actions = []
View Code```