### nn.Embedding in PyTorch

```python
import torch
import torch.nn as nn

embeds = nn.Embedding(2, 5)  # a lookup table of 2 words, each mapped to a 5-dimensional vector
embeds.weight
# Output:
Parameter containing:
tensor([[-1.1454,  0.3675, -0.3718,  0.3733,  0.5979],
        [-0.7952, -0.9794,  0.6292, -0.3633, -0.2037]], requires_grad=True)
```

```python
import numpy as np

# Copy pretrained vectors (an existing [num_embeddings, embedding_dim] array) into the layer
pretrained_weight = np.array(pretrained_weight)
embeds.weight.data.copy_(torch.from_numpy(pretrained_weight))
```
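
PyTorch also offers a one-step constructor for this; a minimal sketch, assuming `pretrained_weight` is the same pretrained matrix as above (`freeze=False` keeps the vectors trainable):

```python
weights = torch.FloatTensor(pretrained_weight)                 # assumed pretrained matrix
embeds = nn.Embedding.from_pretrained(weights, freeze=False)   # freeze=False keeps the vectors trainable
```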

```python
embeds = nn.Embedding(100, 10)     # a vocabulary of 100 words, 10-dimensional vectors
embeds(torch.LongTensor([50]))     # look up the vector for word index 50
# Output
tensor([[-1.9562e-03,  1.8971e+00,  7.0230e-01, -6.3762e-01, -1.9426e-01,
          3.4200e-01, -2.0908e+00, -3.0827e-01,  9.6250e-01, -7.2700e-01]],
       grad_fn=<EmbeddingBackward>)
```

### The Process in Detail

1. Extract all the words in the text and sort them by frequency in descending order (keep the top 4999, i.e. the most common words; every remaining word is represented by '<UNK>', so the vocabulary contains 5000 words in total)

1. One-hot encode each of the 5000 vocabulary words

1. Training produces a $5000\times 300$ matrix in which each row is the word vector of one word. The 300 here is chosen by hand; it is the dimensionality you want for the final word vectors, and you can set it to something else (see the sketch after this list)
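
This picture can be checked directly: an embedding lookup is exactly a one-hot vector multiplied by the weight matrix. A minimal sketch with a toy 5-word vocabulary and 3-dimensional vectors (the sizes are made up for illustration):

```python
import torch
import torch.nn as nn

vocab_size, embed_dim = 5, 3
embeds = nn.Embedding(vocab_size, embed_dim)

idx = torch.LongTensor([2])          # word index 2
one_hot = torch.zeros(1, vocab_size)
one_hot[0, 2] = 1.0                  # one-hot encoding of the same word

lookup = embeds(idx)                 # direct lookup of row 2
matmul = one_hot @ embeds.weight     # one-hot vector times the 5x3 weight matrix
print(torch.allclose(lookup, matmul))  # True
```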

### PyTorch Implementation

#### Imports

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as tud

from collections import Counter
import numpy as np
import random
import math
import pandas as pd
import scipy.spatial
import sklearn
from sklearn.metrics.pairwise import cosine_similarity

# Fix the random seeds so that results are reproducible
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)

# Hyperparameters
C = 3                    # context window size (words on each side of the center word)
K = 15                   # number of negative samples per positive word
epochs = 2
MAX_VOCAB_SIZE = 10000   # keep the 10000 most frequent words
EMBEDDING_SIZE = 100     # dimensionality of the word vectors
batch_size = 32
lr = 0.2
```

#### Read and preprocess the text data

```python
with open('text8.train.txt') as f:
    text = f.read()

text = text.lower().split()  # split into a list of words
vocab_dict = dict(Counter(text).most_common(MAX_VOCAB_SIZE - 1))  # word -> count for the most frequent words
vocab_dict['<UNK>'] = len(text) - np.sum(list(vocab_dict.values()))  # all remaining (rare) words are mapped to "<UNK>"
idx2word = [word for word in vocab_dict.keys()]
word2idx = {word: i for i, word in enumerate(idx2word)}
word_counts = np.array([count for count in vocab_dict.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3. / 4.)  # raise to the 3/4 power, used later for negative sampling
```
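
The 3/4 power flattens the sampling distribution, so rare words are drawn as negatives a bit more often than their raw frequency would suggest. A quick illustration with two made-up counts:

```python
counts = np.array([1000., 10.])        # hypothetical counts of a frequent and a rare word
print(counts / counts.sum())           # ~[0.990, 0.010] without the power
powered = counts ** (3. / 4.)
print(powered / powered.sum())         # ~[0.969, 0.031] after the 3/4 power
```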

Next, the data is wrapped in a `Dataset`/`DataLoader` that needs to:

1. Encode every word as an integer index

1. Store the vocabulary, the word counts and the normalized word frequencies

1. Sample one center word per iteration

1. Return the context words for the current center word

1. Sample some negative words for the center word

1. Return the word counts

A custom `tud.Dataset` must implement the two methods `__len__()` and `__getitem__(idx)`.

```python
class WordEmbeddingDataset(tud.Dataset):
    def __init__(self, text, word2idx, idx2word, word_freqs, word_counts):
        ''' text: a list of words, all text from the training dataset
            word2idx: the dictionary from word to index
            idx2word: index to word mapping
            word_freqs: the frequency of each word
            word_counts: the word counts
        '''
        super(WordEmbeddingDataset, self).__init__()  # initialize the parent class, then override the two required methods
        self.text_encoded = [word2idx.get(word, word2idx['<UNK>']) for word in text]  # encode each word as an index; words not in the vocabulary map to <UNK>
        self.text_encoded = torch.LongTensor(self.text_encoded)  # nn.Embedding expects a LongTensor
        self.word2idx = word2idx
        self.idx2word = idx2word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        return len(self.text_encoded)  # total number of words, i.e. the number of items

    def __getitem__(self, idx):
        ''' This function returns the following data for training:
            - the center word
            - the positive words near this center word
            - K randomly sampled negative words per positive word
        '''
        center_words = self.text_encoded[idx]  # the center word
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))  # indices of the C words on each side of the center word
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]  # take the indices modulo the length to avoid going out of bounds
        pos_words = self.text_encoded[pos_indices]  # index the tensor with a list of indices

        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], True)
        # torch.multinomial draws K * pos_words.shape[0] samples from self.word_freqs and returns the sampled indices
        # sampling is done with replacement, and larger values in self.word_freqs are sampled with higher probability
        # for every positive word we draw K negative words; pos_words.shape[0] is the number of positive words
        return center_words, pos_words, neg_words
```

Indexing a tensor with a list of indices (as in `pos_words = self.text_encoded[pos_indices]`) gathers the elements at those positions:

```python
a = torch.tensor([2, 3, 3, 8, 4, 6, 7, 8, 1, 3, 5, 0], dtype=torch.long)
b = [2, 3, 5, 6]
print(a[b])
# tensor([3, 8, 6, 7])
```

```python
dataset = WordEmbeddingDataset(text, word2idx, idx2word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size, shuffle=True)
```

```python
next(iter(dataset))
'''
(tensor(4813),
 tensor([  50, 9999,  393, 3139,   11,    5]),
 tensor([  82,    0, 2835,   23,  328,   20, 2580, 6768,   34, 1493,   90,    5,
          110,  464, 5760, 5368, 3899, 5249,  776,  883, 8522, 4093,    1, 4159,
         5272, 2860, 9999,    6, 4880, 8803, 2778, 7997, 6381,  264, 2560,   32,
         7681, 6713,  818, 1219, 1750, 8437, 1611,   12,   42,   24,   22,  448,
         9999,   75, 2424, 9970, 1365, 5320,  878,   40, 2585,  790,   19, 2607,
            1,   18, 3847, 2135,  174, 3446,  191, 3648, 9717, 3346, 4974,   53,
          915,   80,   78, 6408, 4737, 4147, 1925, 4718,  737, 1628, 6160,  894,
         9373,   32,  572, 3064,    6,  943]))
'''
```
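
To sanity-check a sample, the indices can be mapped back to words with `idx2word` (the actual words depend on the training text, so no output is shown):

```python
center, pos, neg = next(iter(dataset))
print(idx2word[center.item()])               # the center word
print([idx2word[i] for i in pos.tolist()])   # its context words
```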

#### Define the PyTorch model

```python
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()

        self.vocab_size = vocab_size
        self.embed_size = embed_size

        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size)   # embeddings for center words
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size)  # embeddings for context (output) words

    def forward(self, input_labels, pos_labels, neg_labels):
        ''' input_labels: center words, [batch_size]
            pos_labels: positive words, [batch_size, (window_size * 2)]
            neg_labels: negative words, [batch_size, (window_size * 2 * K)]

            return: loss, [batch_size]
        '''
        input_embedding = self.in_embed(input_labels)   # [batch_size, embed_size]
        pos_embedding = self.out_embed(pos_labels)      # [batch_size, (window * 2), embed_size]
        neg_embedding = self.out_embed(neg_labels)      # [batch_size, (window * 2 * K), embed_size]

        input_embedding = input_embedding.unsqueeze(2)  # [batch_size, embed_size, 1]

        pos_dot = torch.bmm(pos_embedding, input_embedding)   # [batch_size, (window * 2), 1]
        pos_dot = pos_dot.squeeze(2)                           # [batch_size, (window * 2)]

        neg_dot = torch.bmm(neg_embedding, -input_embedding)  # [batch_size, (window * 2 * K), 1]
        neg_dot = neg_dot.squeeze(2)                           # [batch_size, (window * 2 * K)]

        log_pos = F.logsigmoid(pos_dot).sum(1)  # .sum() would give a single scalar; .sum(1) gives a 1-D tensor, one value per example
        log_neg = F.logsigmoid(neg_dot).sum(1)

        loss = log_pos + log_neg

        return -loss

    def input_embeddings(self):
        return self.in_embed.weight.detach().cpu().numpy()
```
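
Written out, the quantity returned by `forward` for each center word $c$ (before taking the batch mean) is the skip-gram negative-sampling loss, with $v_c$ the center-word (input) embedding, $u_o$ the embeddings of the positive context words, $u_k$ the embeddings of the sampled negative words, and $\sigma$ the sigmoid:

$$\mathcal{L} = -\sum_{o}\log\sigma(u_o^\top v_c) - \sum_{k}\log\sigma(-u_k^\top v_c)$$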

`bmm(a, b)` is a batch matrix multiply. Both arguments `a` and `b` must be 3-dimensional tensors, their first (batch) dimensions must match, and their last two dimensions must be compatible for matrix multiplication.

```python
batch1 = torch.randn(10, 3, 4)
batch2 = torch.randn(10, 4, 5)
res = torch.bmm(batch1, batch2)  # each of the 10 [3, 4] matrices multiplies the matching [4, 5] matrix
print(res.size())
# torch.Size([10, 3, 5])
```
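
In the forward pass above, `bmm` is used to compute batched dot products: unsqueezing the center-word embedding to `[batch_size, embed_size, 1]` turns each batch matrix multiply into a dot product between every context embedding and the center embedding. A small sketch with made-up shapes, checked against an explicit sum:

```python
B, W, E = 4, 6, 10                    # batch size, context words per example, embedding size
ctx = torch.randn(B, W, E)            # stands in for pos_embedding
center = torch.randn(B, E)            # stands in for input_embedding

dots_bmm = torch.bmm(ctx, center.unsqueeze(2)).squeeze(2)  # [B, W]
dots_ref = (ctx * center.unsqueeze(1)).sum(2)              # same result, computed element-wise
print(torch.allclose(dots_bmm, dots_ref))                  # True
```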

#### Train the model

```python
# Instantiate the model and an optimizer (the optimizer choice here is an assumption; any optimizer works)
model = EmbeddingModel(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

for e in range(epochs):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        input_labels = input_labels.long()
        pos_labels = pos_labels.long()
        neg_labels = neg_labels.long()

        optimizer.zero_grad()  # clear gradients from the previous step
        loss = model(input_labels, pos_labels, neg_labels).mean()
        loss.backward()
        optimizer.step()

        if i % 100 == 0:
            print('epoch', e, 'iteration', i, loss.item())

embedding_weights = model.input_embeddings()
torch.save(model.state_dict(), "embedding-{}.th".format(EMBEDDING_SIZE))
```
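
To reuse the trained vectors later, the saved state dict can be loaded back into a freshly constructed model; a minimal sketch:

```python
model = EmbeddingModel(MAX_VOCAB_SIZE, EMBEDDING_SIZE)
model.load_state_dict(torch.load("embedding-{}.th".format(EMBEDDING_SIZE)))
embedding_weights = model.input_embeddings()
```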

#### Using the word vectors

```python
def find_nearest(word):
    index = word2idx[word]
    embedding = embedding_weights[index]
    # cosine distance between this word and every word in the vocabulary
    cos_dis = np.array([scipy.spatial.distance.cosine(e, embedding) for e in embedding_weights])
    return [idx2word[i] for i in cos_dis.argsort()[:10]]  # the 10 closest words (including the word itself)
```

```python
for word in ["two", "america", "computer"]:
    print(word, find_nearest(word))
# Output
two ['two', 'zero', 'four', 'one', 'six', 'five', 'three', 'nine', 'eight', 'seven']
america ['america', 'states', 'japan', 'china', 'usa', 'west', 'africa', 'italy', 'united', 'kingdom']
computer ['computer', 'machine', 'earth', 'pc', 'game', 'writing', 'board', 'result', 'code', 'website']
```
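
The `cosine_similarity` imported from sklearn earlier can do the same search in one vectorized call; a minimal sketch of an equivalent helper (`find_nearest_sklearn` is a hypothetical name, not from the original):

```python
def find_nearest_sklearn(word, k=10):
    index = word2idx[word]
    # cosine similarity between the query word and every word in the vocabulary, shape [vocab_size]
    sims = cosine_similarity(embedding_weights, embedding_weights[index:index + 1]).squeeze(1)
    return [idx2word[i] for i in (-sims).argsort()[:k]]  # most similar words first
```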