python notes

  1. 对Corpus (List of tokens)快速建立词频表
collections.Counter(words).most_common(vocabulary_size - 1)
# collections.Counter(words): Counter({token: frequency ...})
# Counter.most_common(n): [(token, frequency)...] : n most frequent tokens
  1. List 拼接
a + b
a.extend(b) # in-place; equivalent to a += b (a + b builds a new list instead)
  1. 创建 reverse dictionary
reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

以上几段代码来自Udacity Deep Learning course assignment 5, 具体如下:

vocabulary_size = 50000


def build_dataset(words):
    """Encode a token corpus as integer ids over a size-capped vocabulary.

    The ``vocabulary_size - 1`` most frequent tokens each receive their own
    id (1, 2, ... in descending frequency order). Id 0 is reserved for the
    placeholder token ``'UNK'`` ("Unknown"), which stands in for every token
    that did not make it into the vocabulary.

    Args:
        words: The corpus as a list of hashable tokens.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)``:

        * ``data``: the corpus as a list of ids, one per input token.
        * ``count``: ``[['UNK', unk_count], (token, freq), ...]`` -- the
          'UNK' entry followed by the retained tokens, most frequent first.
        * ``dictionary``: ``{token: id}``.
        * ``reverse_dictionary``: ``{id: token}``.
    """
    # The 'UNK' entry is a list (not a tuple) so its count, seeded with the
    # placeholder -1, can be patched in place after the corpus is scanned.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

    # Assign ids in order of appearance in `count`, so 'UNK' gets id 0.
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)

    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary
# Encode the corpus; `words` is not defined in this snippet -- presumably the
# token list loaded earlier in the assignment (confirm against the notebook).
data, count, dictionary, reverse_dictionary = build_dataset(words)