```python
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)
```
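As a quick sanity check (not part of the original cell), the two mappings should round-trip any word in the dataset; a minimal sketch, assuming the `stoi`/`itos` dictionaries built above:

```python
# illustrative sanity check: encode a word to integers and decode it back
word = 'emma'                   # hypothetical example word from the dataset
ix = [stoi[ch] for ch in word]  # characters -> integers, e.g. [5, 13, 13, 1]
assert ''.join(itos[i] for i in ix) == word
print(ix)
```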
```python
block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words:
    #print(w)
    context = [0] * block_size
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        #print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)
```
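To see what the rolling context window produces, the first few rows of `X` and `Y` can be decoded back to characters; a short sketch using the tensors built above (the sample word is hypothetical):

```python
# illustrative: decode the first few (context, target) pairs back to characters
for context, target in zip(X[:5], Y[:5]):
    print(''.join(itos[i.item()] for i in context), '--->', itos[target.item()])
# if the first word were "emma", this would print:
# ... ---> e
# ..e ---> m
# .em ---> m
# emm ---> a
# mma ---> .
```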
```python
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    X, Y = [], []
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            #print(''.join(itos[i] for i in context), '--->', itos[ix])
            context = context[1:] + [ix] # crop and append

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y
```
```python
# training split, dev/validation split, test split
# 80%, 10%, 10%
import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))  # end of the 80% training split
n2 = int(0.9*len(words))  # end of the 10% dev split
```
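The shuffled word list is then presumably sliced at `n1` and `n2` and fed through `build_dataset`; a minimal sketch consistent with the `Xtr`, `Ytr`, `Xdev`, `Ydev` tensors used in the cells below (`Xte`, `Yte` for the test split is an assumption):

```python
Xtr,  Ytr  = build_dataset(words[:n1])    # 80% training split
Xdev, Ydev = build_dataset(words[n1:n2])  # 10% dev/validation split
Xte,  Yte  = build_dataset(words[n2:])    # 10% test split
```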
```python
# forward pass over the full training split
emb = C[Xtr]                                # (N, 3, 10): one embedding per context character
h = torch.tanh(emb.view(-1, 30) @ W1 + b1)  # (N, hidden): hidden-layer activations
logits = h @ W2 + b2                        # (N, 27): one score per character in the vocabulary
loss = F.cross_entropy(logits, Ytr)
loss
```
```
tensor(2.1426, grad_fn=<NllLossBackward0>)
```
```python
# evaluate the same forward pass on the dev split
emb = C[Xdev]                               # (N, 3, 10)
h = torch.tanh(emb.view(-1, 30) @ W1 + b1)  # (N, hidden)
logits = h @ W2 + b2                        # (N, 27)
loss = F.cross_entropy(logits, Ydev)
loss
```
```
tensor(2.1830, grad_fn=<NllLossBackward0>)
```
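The training loss (2.1426) is only slightly below the dev loss (2.1830), so the model is not overfitting badly. The two evaluation cells above can also be folded into one helper; a small sketch, assuming the parameters `C`, `W1`, `b1`, `W2`, `b2` defined earlier (`split_loss` is a hypothetical name):

```python
@torch.no_grad()  # no gradients needed for evaluation
def split_loss(X, Y):
    emb = C[X]                                  # (N, block_size, emb_dim)
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
    logits = h @ W2 + b2                        # (N, 27)
    return F.cross_entropy(logits, Y).item()

print('train', split_loss(Xtr, Ytr))
print('dev  ', split_loss(Xdev, Ydev))
```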
A plot of the tuned embeddings. This only really applies to the earlier 2-dimensional embedding; the distances between embeddings also carry some meaning.
```python
# visualize dimensions 0 and 1 of the embedding matrix C for all characters
plt.figure(figsize=(8,8))
plt.scatter(C[:,0].data, C[:,1].data, s=200)
for i in range(C.shape[0]):
    plt.text(C[i,0].item(), C[i,1].item(), itos[i], ha="center", va="center", color='white')
plt.grid('minor')
```
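As the note above says, this scatter plot only makes sense for a 2-dimensional embedding. For a larger embedding (here apparently 10-dimensional), one option is to project `C` onto its top two principal components first; a sketch using `torch.pca_lowrank`, which is an assumption and not part of the original notebook:

```python
# illustrative: project a higher-dimensional embedding matrix down to 2-D with PCA
U, S, V = torch.pca_lowrank(C.data)         # V holds the principal directions
C2 = (C.data - C.data.mean(0)) @ V[:, :2]   # coordinates along the top two components

plt.figure(figsize=(8,8))
plt.scatter(C2[:,0], C2[:,1], s=200)
for i in range(C2.shape[0]):
    plt.text(C2[i,0].item(), C2[i,1].item(), itos[i], ha="center", va="center", color='white')
plt.grid('minor')
```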