makemore

makemore here is a bigram language model: the context length is only 1, so the next character is predicted from the single preceding character. The dataset names.txt contains a list of people's names, and the goal is to use the model to generate new name-like strings.

The counting-based approach

The counting-based approach tallies every pair of consecutive characters in the dataset into a table. The table tells us how many times character j appears after character i; from it we can compute the probability of j following i, and the next character is then obtained by sampling from that probability distribution.

Each name in the dataset is wrapped with a '.' token at the beginning and at the end. The first character of a generated name is therefore sampled from the distribution of characters that follow '.', and sampling '.' again signals the end of the name.
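To make this concrete, here is a minimal sketch of how a single name turns into bigram pairs once the '.' tokens are added (the name "emma" is just an illustrative example):

# illustrative only: split one name into (previous character, next character) pairs
name = "emma"
chs = ['.'] + list(name) + ['.']
for ch1, ch2 in zip(chs, chs[1:]):
    print(ch1, ch2)
# prints: . e / e m / m m / m a / a .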

import torch
import matplotlib.pyplot as plt

N = torch.zeros((27,27), dtype=torch.int32)
words = open("names.txt", "r").read().splitlines()
chars = sorted(list(set("".join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

# count how often each character pair (bigram) occurs
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

# plot the count matrix
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off')
plt.savefig('bigram.png')

[Figure: heatmap of bigram counts, saved as bigram.png]

The table above shows the bigram counts. The pair '..' occurs 0 times because the dataset contains no empty names.
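As a quick sanity check on the table, a small sketch (reusing the N and itos built above) that prints the most frequent bigrams:

# illustrative only: the 5 most common bigrams in the count matrix N
top_counts, top_idx = torch.topk(N.flatten(), 5)
for c, k in zip(top_counts, top_idx):
    i, j = divmod(k.item(), 27)
    print(itos[i] + itos[j], c.item())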

P = (N+1).float()
P /= P.sum(1, keepdim=True) # normalize each row into a probability distribution

for i in range(5):
    out = []
    ix = 0
    while True:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True).item() # sample the next index from the distribution
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

This samples from the fitted probability model. P applies smoothing to the raw counts in N: some character pairs never occur in N at all, and the +1 both lets those pairs be sampled with small probability and avoids problems such as log(0) in the calculations later. Below are some of the generated names; they actually look surprisingly name-like.

[Figure: sample names generated by the count-based model]
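A minimal sketch of why the +1 matters: without it, rows of the probability table contain exact zeros, and taking their log later produces -inf (this reuses the N matrix from above):

# illustrative only: unsmoothed vs. smoothed probability tables
P_raw = N.float()
P_raw /= P_raw.sum(1, keepdim=True)
print((P_raw == 0).sum().item(), 'bigrams have probability exactly 0')
print(torch.log(P_raw).min().item())     # -inf, which would break the loss below

P_smooth = (N + 1).float()
P_smooth /= P_smooth.sum(1, keepdim=True)
print(torch.log(P_smooth).min().item())  # small but finite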

A loss function is used to measure how good the model is: for every bigram in the dataset we want the probability model to assign high probability to the character that actually follows, and the loss summarizes how well it does this.

# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# log(a*b*c) = log(a) + log(b) + log(c)

log_likelihood = 0.0
n = 0

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        p = P[stoi[ch1], stoi[ch2]]
        log_likelihood += torch.log(p)
        n += 1

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

If the model predicted perfectly, the corresponding p would be 1 for every bigram, its log would be 0, and the final loss would be 0. Otherwise, the smaller p is, the more negative log p becomes, and the larger the negative-log loss.

log_likelihood=tensor(-559951.5625)
nll=tensor(559951.5625)
2.4543561935424805
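A tiny worked example of how the assigned probability maps to the per-bigram loss:

# illustrative only: negative log likelihood for a few probabilities
for p in [1.0, 0.5, 0.1, 0.01]:
    print(p, -torch.log(torch.tensor(p)).item())
# 1.0  -> 0.0
# 0.5  -> 0.693...
# 0.1  -> 2.302...
# 0.01 -> 4.605...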

The neural-network approach

import torch.nn.functional as F

# create the training set of bigrams (x,y)
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
W = torch.randn((27, 27)) # a single row of 27 neurons, one output per character

xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = torch.exp(logits)
probs = counts / counts.sum(1, keepdim=True)
print(probs.shape)
nlls = torch.zeros(5)
for i in range(5):
    # i-th bigram:
    x = xs[i].item() # input character index
    y = ys[i].item() # label character index
    print('--------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input to the neural net:', x)
    print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    p = probs[i, y]
    print('probability assigned by the net to the correct character:', p.item())
    logp = torch.log(p)
    print('log likelihood:', logp.item())
    nll = -logp
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

W is the network: a single row of 27 neurons with randomly initialized parameters, producing a 27-dimensional output, followed by a softmax (exp, then normalization) to obtain a probability distribution analogous to the one from the counting approach. The raw network outputs can be interpreted as log-counts: applying exp recovers the counts, and normalizing the counts gives the probabilities.
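One detail worth making explicit: because xenc is one-hot, the product xenc @ W simply selects rows of W, so the probability the network assigns to input index ix is just a softmax over the row W[ix]. A minimal sketch reusing xs, xenc and W from the block above:

# illustrative only: one-hot input times W is the same as indexing rows of W
logits_a = xenc @ W
logits_b = W[xs]
print(torch.allclose(logits_a, logits_b))  # True

# so the forward pass is a row-wise softmax over W
probs_manual = torch.softmax(W[xs], dim=1)
print(torch.allclose(probs_manual, logits_a.exp() / logits_a.exp().sum(1, keepdim=True)))  # True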

# --------- !!! OPTIMIZATION !!! yay, but this time actually --------------
# create the dataset
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
W = torch.randn((27, 27), requires_grad=True)

# gradient descent
for k in range(100):
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()

    # update
    W.data += -50 * W.grad

This trains the network end to end. Besides the negative-log-likelihood term, the loss adds a regularization term, 0.01*(W**2).mean(), which keeps the entries of W from growing too large and smooths the model, similar in effect to the earlier P = N + 1 add-one smoothing.
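As a side note (an alternative formulation, not part of the original code), the softmax and negative-log-likelihood part of this loss is exactly what F.cross_entropy computes from the raw logits, and it does so in a more numerically stable way; the regularization term is still added by hand. A sketch, assuming the same xs, ys and W as above:

# illustrative only: the same loss expressed with F.cross_entropy
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
loss = F.cross_entropy(logits, ys) + 0.01 * (W**2).mean()
print(loss.item())  # matches the manually computed loss above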


The complete code:

import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F

N = torch.zeros((27,27), dtype=torch.int32)
words = open("names.txt", "r").read().splitlines()
chars = sorted(list(set("".join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

# count how often each character pair (bigram) occurs
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

# plot the count matrix
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off')
# plt.savefig('bigram.png')

P = (N+1).float()
P /= P.sum(1, keepdim=True) # normalize each row into a probability distribution

for i in range(5):
    out = []
    ix = 0
    while True:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True).item() # sample the next index from the distribution
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# log(a*b*c) = log(a) + log(b) + log(c)

log_likelihood = 0.0
n = 0

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        p = P[stoi[ch1], stoi[ch2]]
        log_likelihood += torch.log(p)
        n += 1

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')
print('========================================')

# create the training set of bigrams (x,y)
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
W = torch.randn((27, 27))


xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = torch.exp(logits)
probs = counts / counts.sum(1, keepdim=True)
print(probs.shape)
nlls = torch.zeros(5)
for i in range(5):
    # i-th bigram:
    x = xs[i].item() # input character index
    y = ys[i].item() # label character index
    print('--------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input to the neural net:', x)
    print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    p = probs[i, y]
    print('probability assigned by the net to the correct character:', p.item())
    logp = torch.log(p)
    print('log likelihood:', logp.item())
    nll = -logp
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

print('==================')
# --------- !!! OPTIMIZATION !!! yay, but this time actually --------------
# create the dataset
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
W = torch.randn((27, 27), requires_grad=True)
# gradient descent
for k in range(1000):
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()

    # update
    W.data += -50 * W.grad


for i in range(5):
    out = []
    ix = 0
    while True:
        # ----------
        # BEFORE:
        # p = P[ix]
        # ----------
        # NOW:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdim=True) # probabilities for next character
        # ----------

        ix = torch.multinomial(p, num_samples=1, replacement=True).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))