欢迎来到代码驿站!

Python代码

当前位置:首页 > 软件编程 > Python代码

python正向最大匹配分词和逆向最大匹配分词的实例

时间:2020-11-25 12:44:08|栏目:Python代码|点击:

正向最大匹配

# -*- coding:utf-8 -*-
 
CODEC='utf-8'
 
def u(s, encoding):
  'converted other encoding to unicode encoding'
  if isinstance(s, unicode):
    return s
  else:
    return unicode(s, encoding)
 
def fwd_mm_seg(wordDict, maxLen, str):
  'forward max match segment'
  wordList = []
  segStr = str
  segStrLen = len(segStr)
  for word in wordDict:
    print 'word: ', word
  print "\n"
  while segStrLen > 0:
    if segStrLen > maxLen:
      wordLen = maxLen
    else:
      wordLen = segStrLen
    subStr = segStr[0:wordLen]
    print "subStr: ", subStr
    while wordLen > 1:
      if subStr in wordDict:
        print "subStr1: %r" % subStr
        break
      else:
        print "subStr2: %r" % subStr
        wordLen = wordLen - 1
        subStr = subStr[0:wordLen]
#      print "subStr3: ", subStr
    wordList.append(subStr)
    segStr = segStr[wordLen:]
    segStrLen = segStrLen - wordLen
  for wordstr in wordList:
    print "wordstr: ", wordstr
  return wordList
    
      
def main():
  fp_dict = open('words.dic')
  wordDict = {}
  for eachWord in fp_dict:
    wordDict[u(eachWord.strip(), 'utf-8')] = 1
  segStr = u'你好世界hello world'
  print segStr
  wordList = fwd_mm_seg(wordDict, 10, segStr)
  print "==".join(wordList)
  
 
if __name__ == '__main__':
  main()
  

逆向最大匹配

# -*- coding:utf-8 -*-
 
 
def u(s, encoding):
  'converted other encoding to unicode encoding'
  if isinstance(s, unicode):
    return s
  else:
    return unicode(s, encoding)
 
CODEC='utf-8'
 
def bwd_mm_seg(wordDict, maxLen, str):
  'forward max match segment'
  wordList = []
  segStr = str
  segStrLen = len(segStr)
  for word in wordDict:
    print 'word: ', word
  print "\n"
  while segStrLen > 0:
    if segStrLen > maxLen:
      wordLen = maxLen
    else:
      wordLen = segStrLen
    subStr = segStr[-wordLen:None]
    print "subStr: ", subStr
    while wordLen > 1:
      if subStr in wordDict:
        print "subStr1: %r" % subStr
        break
      else:
        print "subStr2: %r" % subStr
        wordLen = wordLen - 1
        subStr = subStr[-wordLen:None]
#      print "subStr3: ", subStr
    wordList.append(subStr)
    segStr = segStr[0: -wordLen]
    segStrLen = segStrLen - wordLen
  wordList.reverse()
  for wordstr in wordList:
    print "wordstr: ", wordstr
  return wordList
    
      
def main():
  fp_dict = open('words.dic')
  wordDict = {}
  for eachWord in fp_dict:
    wordDict[u(eachWord.strip(), 'utf-8')] = 1
  segStr = ur'你好世界hello world'
  print segStr
  wordList = bwd_mm_seg(wordDict, 10, segStr)
  print "==".join(wordList)
 
if __name__ == '__main__':
  main()
  

上一篇:浅谈Python中的模块

栏    目:Python代码

下一篇:python中adb有什么功能

本文标题:python正向最大匹配分词和逆向最大匹配分词的实例

本文地址:http://www.codeinn.net/misctech/26270.html

推荐教程

广告投放 | 联系我们 | 版权申明

重要申明:本站所有的文章、图片、评论等,均由网友发表或上传并维护或收集自网络,属个人行为,与本站立场无关。

如果侵犯了您的权利,请与我们联系,我们将在24小时内进行处理、任何非本站因素导致的法律后果,本站均不负任何责任。

联系QQ:914707363 | 邮箱:codeinn#126.com(#换成@)

Copyright © 2020 代码驿站 版权所有