makemore

makemore here is a bigram language model: the context length is only 1, so the next character is predicted from the single preceding character. The dataset names.txt contains a list of people's names, and the goal is to use the model to generate new name-like strings.

The counting-based approach

The counting-based approach tallies every pair of consecutive characters in the dataset into a table. The table tells us how many times character j appears after character i; from it we can compute the probability of j following i, and the next character is then obtained by sampling from that probability distribution.

Each name in the dataset is wrapped with a '.' token at the beginning and at the end. The first character of a generated name is therefore sampled from the distribution of characters that follow '.', and sampling '.' again signals the end of the name.
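To make this concrete, here is a minimal sketch of how a single name turns into bigram pairs once the '.' tokens are added (the name "emma" is just an illustrative example):

# illustrative only: split one name into (previous character, next character) pairs
name = "emma"
chs = ['.'] + list(name) + ['.']
for ch1, ch2 in zip(chs, chs[1:]):
    print(ch1, ch2)
# prints: . e / e m / m m / m a / a .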

import torch
import matplotlib.pyplot as plt

N = torch.zeros((27,27), dtype=torch.int32)
words = open("names.txt", "r").read().splitlines()
chars = sorted(list(set("".join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

# count how often each character pair (bigram) occurs
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

# plot the count matrix
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off')
plt.savefig('bigram.png')

[Figure: heatmap of bigram counts, saved as bigram.png]

The table above shows the bigram counts. The pair '..' occurs 0 times because the dataset contains no empty names.
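As a quick sanity check on the table, a small sketch (reusing the N and itos built above) that prints the most frequent bigrams:

# illustrative only: the 5 most common bigrams in the count matrix N
top_counts, top_idx = torch.topk(N.flatten(), 5)
for c, k in zip(top_counts, top_idx):
    i, j = divmod(k.item(), 27)
    print(itos[i] + itos[j], c.item())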

P = (N+1).float()
P /= P.sum(1, keepdim=True) # normalize each row into a probability distribution

for i in range(5):
    out = []
    ix = 0
    while True:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True).item() # sample the next index from the distribution
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

This samples from the fitted probability model. P applies smoothing to the raw counts in N: some character pairs never occur in N at all, and the +1 both lets those pairs be sampled with small probability and avoids problems such as log(0) in the calculations later. Below are some of the generated names; they actually look surprisingly name-like.

[Figure: sample names generated by the count-based model]
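A minimal sketch of why the +1 matters: without it, rows of the probability table contain exact zeros, and taking their log later produces -inf (this reuses the N matrix from above):

# illustrative only: unsmoothed vs. smoothed probability tables
P_raw = N.float()
P_raw /= P_raw.sum(1, keepdim=True)
print((P_raw == 0).sum().item(), 'bigrams have probability exactly 0')
print(torch.log(P_raw).min().item())     # -inf, which would break the loss below

P_smooth = (N + 1).float()
P_smooth /= P_smooth.sum(1, keepdim=True)
print(torch.log(P_smooth).min().item())  # small but finite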

A loss function is used to measure how good the model is: for every bigram in the dataset we want the probability model to assign high probability to the character that actually follows, and the loss summarizes how well it does this.

# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# log(a*b*c) = log(a) + log(b) + log(c)

log_likelihood = 0.0
n = 0

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        p = P[stoi[ch1], stoi[ch2]]
        log_likelihood += torch.log(p)
        n += 1

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

If the model predicted perfectly, the corresponding p would be 1 for every bigram, its log would be 0, and the final loss would be 0. Otherwise, the smaller p is, the more negative log p becomes, and the larger the negative-log loss.

log_likelihood=tensor(-559951.5625)
nll=tensor(559951.5625)
2.4543561935424805
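A tiny worked example of how the assigned probability maps to the per-bigram loss:

# illustrative only: negative log likelihood for a few probabilities
for p in [1.0, 0.5, 0.1, 0.01]:
    print(p, -torch.log(torch.tensor(p)).item())
# 1.0  -> 0.0
# 0.5  -> 0.693...
# 0.1  -> 2.302...
# 0.01 -> 4.605...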

The neural-network approach

import torch.nn.functional as F

# create the training set of bigrams (x,y)
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
W = torch.randn((27, 27)) # a single row of 27 neurons, one output per character

xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = torch.exp(logits)
probs = counts / counts.sum(1, keepdim=True)
print(probs.shape)
nlls = torch.zeros(5)
for i in range(5):
    # i-th bigram:
    x = xs[i].item() # input character index
    y = ys[i].item() # label character index
    print('--------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input to the neural net:', x)
    print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    p = probs[i, y]
    print('probability assigned by the net to the correct character:', p.item())
    logp = torch.log(p)
    print('log likelihood:', logp.item())
    nll = -logp
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

W is the network: a single row of 27 neurons with randomly initialized parameters, producing a 27-dimensional output, followed by a softmax (exp, then normalization) to obtain a probability distribution analogous to the one from the counting approach. The raw network outputs can be interpreted as log-counts: applying exp recovers the counts, and normalizing the counts gives the probabilities.
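One detail worth making explicit: because xenc is one-hot, the product xenc @ W simply selects rows of W, so the probability the network assigns to input index ix is just a softmax over the row W[ix]. A minimal sketch reusing xs, xenc and W from the block above:

# illustrative only: one-hot input times W is the same as indexing rows of W
logits_a = xenc @ W
logits_b = W[xs]
print(torch.allclose(logits_a, logits_b))  # True

# so the forward pass is a row-wise softmax over W
probs_manual = torch.softmax(W[xs], dim=1)
print(torch.allclose(probs_manual, logits_a.exp() / logits_a.exp().sum(1, keepdim=True)))  # True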

# --------- !!! OPTIMIZATION !!! yay, but this time actually --------------
# create the dataset
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
W = torch.randn((27, 27), requires_grad=True)

# gradient descent
for k in range(100):
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()

    # update
    W.data += -50 * W.grad

This trains the network end to end. Besides the negative-log-likelihood term, the loss adds a regularization term, 0.01*(W**2).mean(), which keeps the entries of W from growing too large and smooths the model, similar in effect to the earlier P = N + 1 add-one smoothing.
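As a side note (an alternative formulation, not part of the original code), the softmax and negative-log-likelihood part of this loss is exactly what F.cross_entropy computes from the raw logits, and it does so in a more numerically stable way; the regularization term is still added by hand. A sketch, assuming the same xs, ys and W as above:

# illustrative only: the same loss expressed with F.cross_entropy
xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
loss = F.cross_entropy(logits, ys) + 0.01 * (W**2).mean()
print(loss.item())  # matches the manually computed loss above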


The complete code:

import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F

N = torch.zeros((27,27), dtype=torch.int32)
words = open("names.txt", "r").read().splitlines()
chars = sorted(list(set("".join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

# count how often each character pair (bigram) occurs
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

# plot the count matrix
plt.figure(figsize=(16,16))
plt.imshow(N, cmap='Blues')
for i in range(27):
    for j in range(27):
        chstr = itos[i] + itos[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color='gray')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color='gray')
plt.axis('off')
# plt.savefig('bigram.png')

P = (N+1).float()
P /= P.sum(1, keepdim=True) # normalize each row into a probability distribution

for i in range(5):
    out = []
    ix = 0
    while True:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True).item() # sample the next index from the distribution
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))

# GOAL: maximize likelihood of the data w.r.t. model parameters (statistical modeling)
# equivalent to maximizing the log likelihood (because log is monotonic)
# equivalent to minimizing the negative log likelihood
# equivalent to minimizing the average negative log likelihood

# log(a*b*c) = log(a) + log(b) + log(c)

log_likelihood = 0.0
n = 0

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        p = P[stoi[ch1], stoi[ch2]]
        log_likelihood += torch.log(p)
        n += 1

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')
print('========================================')

# create the training set of bigrams (x,y)
xs, ys = [], []

for w in words[:1]:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
W = torch.randn((27, 27))


xenc = F.one_hot(xs, num_classes=27).float()
logits = xenc @ W
counts = torch.exp(logits)
probs = counts / counts.sum(1, keepdim=True)
print(probs.shape)
nlls = torch.zeros(5)
for i in range(5):
    # i-th bigram:
    x = xs[i].item() # input character index
    y = ys[i].item() # label character index
    print('--------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input to the neural net:', x)
    print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    p = probs[i, y]
    print('probability assigned by the net to the correct character:', p.item())
    logp = torch.log(p)
    print('log likelihood:', logp.item())
    nll = -logp
    print('negative log likelihood:', nll.item())
    nlls[i] = nll

print('=========')
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

print('==================')
# --------- !!! OPTIMIZATION !!! yay, but this time actually --------------
# create the dataset
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)

# initialize the 'network'
W = torch.randn((27, 27), requires_grad=True)
# gradient descent
for k in range(1000):
    # forward pass
    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
    logits = xenc @ W # predict log-counts
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True) # probabilities for next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None # set to zero the gradient
    loss.backward()

    # update
    W.data += -50 * W.grad


for i in range(5):
    out = []
    ix = 0
    while True:
        # ----------
        # BEFORE:
        # p = P[ix]
        # ----------
        # NOW:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = xenc @ W # predict log-counts
        counts = logits.exp() # counts, equivalent to N
        p = counts / counts.sum(1, keepdim=True) # probabilities for next character
        # ----------

        ix = torch.multinomial(p, num_samples=1, replacement=True).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))