Python 中多项式朴素贝叶斯算法的本机实践

```defbuildGlobalVocab(self):
vocab = []
for doc in self.docs:
vocab.extend(self.cleanDoc(doc))
return np.unique(vocab)
defbuildClassVocab(self, _cls):
curr_word_list = []
for idx, doc inenumerate(self.docs):
if self.classes[idx] == _cls:
curr_word_list.extend(self.cleanDoc(doc))
if _cls notinself.class_vocab:
self.class_vocab[_cls]=curr_word_list
else:
self.class_vocab[_cls].append(curr_word_list)```

```@staticmethod
defcleanDoc(doc):
return re.sub(r [^a-zd ] ,   , doc.lower()).split(   )```

```deffit(self, x, y):
self.docs = x
self.classes = y
num_doc =len(self.docs)
uniq_cls = np.unique(self.classes)
self.vocab = self.buildGlobalVocab()
vocab_cnt =len(self.vocab)
for _cls in uniq_cls:
cls_docs_num = self.countCls(_cls)
self.logprior[_cls] = np.log(cls_docs_num/num_doc)
self.buildClassVocab(_cls)
class_vocab_counter =Counter(self.class_vocab[_cls])
class_vocab_cnt =len(self.class_vocab[_cls])
for word in self.vocab:
w_cnt =class_vocab_counter[word]
self.loglikelihood[word, _cls] = np.log((w_cnt +1)/(class_vocab_cnt+ vocab_cnt))```

```defpredict(self,test_docs):
output = []
for doc in test_docs:
uniq_cls = np.unique(self.classes)
sum =dict()
for  _cls in uniq_cls:
sum[_cls] =self.logprior[_cls]
for word in self.cleanDoc(doc):
if word in self.vocab:
try:
sum[_cls] +=self.loglikelihood[word, _cls]
except:
print(sum, _cls)
result = np.argmax(list(sum.values()))
output.append(uniq_cls[result])
return output```

```defreadFile(self, size =70000, testSize =0.3):
lines = pd.read_csv("data/news_aggregator.csv", nrows = size);
x = lines.TITLE
y = lines.CATEGORY
skip =round(size * (1- testSize))
x_train, y_train, x_test, y_test = x[:skip],y[:skip], x[skip:size], y[skip:size]
print( Train data:  , len(x_train),  Testing data:  , len(x_test),  Total:  , len(x))
return x_train, y_train,x_test, y_test```

```defmain(self):
x_train, y_train, x_test, y_test = self.readFile(size =50000, testSize=0.3)
nb =MultinominalNB()
nb.fit(x_train, y_train)
predictions = nb.predict(x_test)
print( Accuracy:  , self.accuracy(predictions,y_test))```

```defsaveModel(self):
try:
f =open("models/classifier", "wb")
pickle.dump([self.logprior,self.vocab, self.loglikelihood, self.classes], f)
f.close()
except:
print( Error savingthe model )
@staticmethod
try:
f =open("models/classifier", "rb")
f.close()
return model
except:

```deffit(self, x, y,save =False):
self.docs = x
self.classes = y
num_doc =len(self.docs)
uniq_cls = np.unique(self.classes)
self.vocab = self.buildGlobalVocab()
vocab_cnt =len(self.vocab)
t =time()
for _cls in uniq_cls:
cls_docs_num = self.countCls(_cls)
self.logprior[_cls] = np.log(cls_docs_num/num_doc)
self.buildClassVocab(_cls)
class_vocab_counter =Counter(self.class_vocab[_cls])
class_vocab_cnt =len(self.class_vocab[_cls])
for word in self.vocab:
w_cnt =class_vocab_counter[word]
self.loglikelihood[word, _cls] = np.log((w_cnt +1)/(class_vocab_cnt+ vocab_cnt))
if save:
self.saveModel()
print( Trainingfinished at {} mins. .format(round((time() - t) /60, 2)))
defpredict(self,test_docs,cached =False):
output = []
ifnot cached:
logprior = self.logprior
vocab = self.vocab
loglikelihood = self.loglikelihood
classes = self.classes
else:
logprior, vocab, loglikelihood, classes = self.readModel()
for doc in test_docs:
uniq_cls = np.unique(classes)
sum =dict()
for  _cls in uniq_cls:
sum[_cls] = logprior[_cls]
for word in self.cleanDoc(doc):
if word in vocab:
try:
sum[_cls] +=loglikelihood[word, _cls]
except:
print(sum, _cls)
result = np.argmax(list(sum.values()))
output.append(uniq_cls[result])
return output```

```defmain(self):
x_train, y_train, x_test, y_test = self.readFile(size =50000, testSize=0.3)
nb =MultinominalNB()
"""
Run the code below the first time you runthe script
nb.fit(x_train,y_train, save = True)
"""
predictions = nb.predict([ Google launchesa new app. ], cached =True)
print(predictions)```