viterbi算法 python版

牛mm细心给我讲了一个小时,终于明白它的含义,然后花了一两节分布式数据库的课实现了。当时牛mm还说不可能这么快实现,结果不可能事还是发生了。发现python果真非常好用。不明白此算法可以看这篇blog http://blog.csdn.net/NirvanaFeng/archive/2009/05/12/4171799.aspx

初始化方法:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def InitDicForViterbi(nodes,posw,posdi,n):
newWordList = []
# 解决未登录词
for i in nodes:
if not posw.has_key(i):
newWordList.append(i)
maxPosList = NPos(posdi,n)
print maxPosList
GenerateDicPos(newWordList,maxPosList,posw,posdi)
def InitViterbi(node,posw,posdi):
viStatePath = []
for i in posw[node]:
if i <> "@@@":
print "i:",i
viStatePath.append([posw[node][i]*posdi[i]["@@@"],[i]])
return viStatePath

viterbi算法函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
""" nodes 就是分好的词, posw 是词转换为词性的概率,posdi是词性之间的转换概率,n 是n个最大的词性将此用于未登录词中,
weightNone是未出现的词性转移的概率
nodes format {word1,word2...} , posw format {word1:{pos1:fre,pos2:fre,..."@@@":totalnum},..."@@@":total}
posdi format {pos1:{pos2:fre,pos3:fre...."@@@":total},...."@@@":total}
"""
def Viterbi(nodes,posw,posdi,n,weightNone):
InitDicForViterbi(nodes,posw,posdi,n)
viStatePath = InitViterbi(nodes[0],posw,posdi)
length = len(nodes)
currentNode = 1
while currentNode < length:
currentPosList = posw[nodes[currentNode]]
paths = []
# print "vstate:",viStatePath
for k in currentPosList:
if k <> "@@@":
ajk = weightNone
heap = []
for j in xrange(len(viStatePath)):
# compute every state j to every state k in ti
# temppath = viStatePath[j]
# print "lastpos:",temppath
lastpos = viStatePath[j][1][-1]
lastweight = viStatePath[j][0]
lastposList = posdi[lastpos]
if lastposList.has_key(k):
ajk =lastposList[k]
currentweight = lastweight * ajk * currentPosList[k]
# print "viStatePath:",viStatePath[j][1]
pathNew = [data for data in viStatePath[j][1]]
pathNew.append(k)
# print "pathNew:",pathNew
heappush(heap,[currentweight,pathNew])
# get the max possibility of state k in ti
# print "path:",path
paths.append(nlargest(1,heap)[0])
del viStatePath
viStatePath = paths
# print "paths:",paths
currentNode = currentNode + 1
heap = []
# get the max possibility path
for i in viStatePath:
heappush(heap,i)
return nlargest(1,heap)

结果打印输出函数:

1
2
3
4
5
6
7
8
"""nodes format {word1,word2,...} path is [weight,[pos1,pos2....]]"""
def Result(nodes,path,edcode="utf-8"):
realPath = path[0][1]
ResultPrint(nodes,realPath,edcode)
"""nodes format {word1,word2,...} path is [pos1,pos2....]"""
def ResultPrint(nodes,path,edcode="utf-8"):
for i in xrange(len(nodes)):
print nodes[i].decode(edcode),"/",path[i].decode(edcode)

下面是测试代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
aa = ConvertGBKtoUTF("球球")
bb = ConvertGBKtoUTF("娃娃")
cc = ConvertGBKtoUTF("吃饭")
dd = ConvertGBKtoUTF("好")
ee = ConvertGBKtoUTF("dddwieoewkem")
dictions = {aa:{bb:1,"@@@":4},bb:{cc:2,aa:3,"@@@":40},"@@@":400}
posdi = {"n":{"s":3,"v":3,"@@@":40},"s":{"v":2,"e":3,"@@@":33},"v":{"@@@":1},"@@@":100}
posw = {aa:{"n":1,"v":3,"@@@":29},bb:{"n":1,"s":1,"@@@":19},cc:{"n":1,"@@@":1},"@@@":10002}
nodes = [aa,bb,cc,ee,dd,bb,aa,cc]
path = Viterbi(nodes,posw,posdi,3,0.01)
print path
Result(nodes,path)

这里不要问为啥要encode,然后再decode 因为只有这样才能在屏幕上打印出中文。

  • 本文作者: 帐前卒
  • 本文链接: http://chillyc.info/2009/4290974/
  • 版权声明: 本博客所有文章除特别声明外,只能复制超链接地址,且必须注明出处!