# -*- coding: utf-8 -*-
import time
import re
import os
import string
import sys
import math
''' ------------------------------------------------------- '''
# Count the keywords of a document and how often each one occurs
def CountKey(fileName, resultName):
    """Count whitespace-separated words in a file and save the ranking.

    Every line of ``fileName`` is split on whitespace and the occurrences of
    each word are tallied.  The ranking is written to ``resultName`` as one
    ``<word:count>`` entry per line and also returned.  The line count, every
    line and every word list are echoed to stdout as progress output.

    Args:
        fileName: Path of the text file to analyse.
        resultName: Path of the report file to (over)write.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count; words
        with equal counts are ordered alphabetically so the result is
        deterministic.  An empty list is returned when reading or writing
        fails (the error is printed, not raised).
    """
    table = {}
    try:
        # Read the input once; the context manager closes the handle even
        # when reading fails.
        with open(fileName, 'r') as source:
            lines = source.readlines()
        # NOTE(review): the label text below looks garbled by a transcoding
        # step; it is kept byte-for-byte -- confirm the intended wording.
        print(u'文件行數(shù): ' + str(len(lines)))

        for line in lines:
            line = line.rstrip('\n')
            print(line)
            words = line.split()  # split() never yields empty strings
            # Show the word list with non-ASCII words unescaped.
            print("[" + ", ".join("'%s'" % (word,) for word in words) + "]")
            for word in words:
                table[word] = table.get(word, 0) + 1

        # Highest count first; ties broken by the word itself.
        dic = sorted(table.items(), key=lambda item: (-item[1], item[0]))

        # The report is opened only after the input was read successfully,
        # so a missing input never leaves a truncated report behind.
        with open(resultName, 'w') as result:
            for word, count in dic:
                result.write('<' + word + ':' + str(count) + '>\n')
        return dic
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back an empty
        # ranking instead of aborting the whole run.
        print('Error: %s' % (e,))
        return []
    finally:
        print('END\n')
''' ------------------------------------------------------- '''
# Merge the keywords of two documents and compute their similarity
def MergeKeys(dic1, dic2):
    """Return the cosine similarity of two keyword rankings.

    The keywords of both rankings are merged into one ordered key list
    (keys of ``dic1`` first, then the keys that only occur in ``dic2``).
    Each ranking is turned into a count vector over that key list and the
    cosine of the angle between the two vectors is returned.  Intermediate
    values are echoed to stdout as progress output.

    Args:
        dic1: Sequence of ``(keyword, count)`` entries for the first document.
        dic2: Sequence of ``(keyword, count)`` entries for the second document.
            Keywords must be hashable.

    Returns:
        The cosine similarity as a float.  ``0.0`` is returned when either
        vector has zero length (e.g. an empty ranking), where the cosine is
        undefined and the plain formula would divide by zero.
    """
    # Merged key list plus a key -> first-index map for O(1) lookups.
    arrayKey = []
    position = {}
    for entry in dic1:
        position.setdefault(entry[0], len(arrayKey))
        arrayKey.append(entry[0])
    for entry in dic2:
        key = entry[0]
        if key in position:
            print('has_key %s' % (key,))
        else:
            position[key] = len(arrayKey)
            arrayKey.append(key)
    print('\n')
    # Show the merged keys with non-ASCII words unescaped.
    print("[" + ", ".join("'%s'" % (key,) for key in arrayKey) + "]")

    # Count vectors over the merged key list (plain term frequency; no
    # TF-IDF weighting is applied).
    arrayNum1 = [0] * len(arrayKey)
    arrayNum2 = [0] * len(arrayKey)
    for entry in dic1:
        arrayNum1[position[entry[0]]] = entry[1]
    for entry in dic2:
        arrayNum2[position[entry[0]]] = entry[1]
    print(arrayNum1)
    print(arrayNum2)
    print('%d %d %d' % (len(arrayNum1), len(arrayNum2), len(arrayKey)))

    # Dot product of the two vectors.
    x = sum(a * b for a, b in zip(arrayNum1, arrayNum2))
    print(x)

    # Squared norms of the two vectors.
    sq1 = sum(a * a for a in arrayNum1)
    print(sq1)
    sq2 = sum(b * b for b in arrayNum2)
    print(sq2)

    norm = math.sqrt(sq1) * math.sqrt(sq2)
    if norm == 0:
        # At least one vector is all zeros: nothing in common to measure.
        return 0.0
    return float(x) / norm
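''' -------------------------------------------------------
Basic steps:
1. Count the keywords of each of the two documents
2. Merge the keywords of both articles into one set: shared keywords are merged, distinct ones are added
3. Compute each article's term frequency over that set (TF-IDF can be used to compute the weights)
4. Build the term-frequency vector of each article
5. Compute the cosine similarity of the two vectors; a larger value means more similar
------------------------------------------------------- '''
# Main function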
def main():
    """Rank the keywords of two crawled documents and print their similarity.

    Runs ``CountKey`` on the Baidu and the Hudong document (each call also
    writes a ranking report next to its input), then prints the cosine
    similarity returned by ``MergeKeys``.
    """
    # Document 1 (Baidu): keywords and their counts.
    fileName1 = "BaiduSpider.txt"
    resultName1 = "Result_Key_BD.txt"
    dic1 = CountKey(fileName1, resultName1)

    # Document 2 (Hudong): keywords and their counts.  The paths are joined
    # with os.path.join because a literal "HudongSpider\001.txt" would be
    # parsed as the octal escape "\001" rather than a path separator.
    fileName2 = os.path.join("HudongSpider", "001.txt")
    resultName2 = os.path.join("HudongSpider", "Result_Key_001.txt")
    dic2 = CountKey(fileName2, resultName2)

    # Merge the keywords of both articles and compute their similarity.
    result = MergeKeys(dic1, dic2)
    print(result)
# Run the comparison only when executed as a script, not when imported.
if __name__ == '__main__':
    main()