基于python构建搜索引擎系列——(五)推荐阅读

Gypsophila

发布日期: 2019-06-03 15:49:20 浏览量: 806
评分:
star star star star star star star star star_border star_border
*转载请注明来自write-bug.com

虽然主要的检索功能实现了,但是我们还需要一个“推荐阅读”的功能。当用户浏览某条具体新闻时,我们在页面底端给出5条和该新闻相关的新闻,也就是一个最简单的推荐系统。

推荐模块的思路是度量两两新闻之间的相似度,取相似度最高的前5篇新闻作为推荐阅读的新闻。

我们前面讲过,一篇文档可以用一个向量表示,向量中的每个值是不同词项t在该文档d中的词频tf。但是一篇较短的文档(如新闻)的关键词并不多,所以我们可以提取每篇新闻的关键词,用这些关键词的tfidf值构成文档的向量表示,这样能够大大减少相似度计算量,同时保持较好的推荐效果。

jieba分词组件自带关键词提取功能,并能返回关键词的tfidf值。所以对每篇新闻,我们先提取tfidf得分最高的前25个关键词,用这25个关键词的tfidf值作为文档的向量表示。由此能够得到一个1000*m的文档词项矩阵M,矩阵每行表示一个文档,每列表示一个词项,m为1000个文档中所有互异关键词的个数(大概10000个)。由于每个文档最多只有25个非零值,矩阵M当然也是稀疏矩阵。

得到文档词项矩阵M之后,我们利用sklearn的pairwise_distances函数计算M中行向量之间的cosine距离,再用1减去该距离得到cosine相似度。对每个文档,取与其最相似的前5篇新闻的id,并把结果写入数据库。

推荐阅读模块的代码如下:

  1. from os import listdir
  2. import xml.etree.ElementTree as ET
  3. import jieba
  4. import jieba.analyse
  5. import sqlite3
  6. import configparser
  7. from datetime import *
  8. import math
  9. import pandas as pd
  10. import numpy as np
  11. from sklearn.metrics import pairwise_distances
  12. class RecommendationModule:
  13. stop_words = set()
  14. k_nearest = []
  15. config_path = ''
  16. config_encoding = ''
  17. doc_dir_path = ''
  18. doc_encoding = ''
  19. stop_words_path = ''
  20. stop_words_encoding = ''
  21. idf_path = ''
  22. db_path = ''
  23. def __init__(self, config_path, config_encoding):
  24. self.config_path = config_path
  25. self.config_encoding = config_encoding
  26. config = configparser.ConfigParser()
  27. config.read(config_path, config_encoding)
  28. self.doc_dir_path = config['DEFAULT']['doc_dir_path']
  29. self.doc_encoding = config['DEFAULT']['doc_encoding']
  30. self.stop_words_path = config['DEFAULT']['stop_words_path']
  31. self.stop_words_encoding = config['DEFAULT']['stop_words_encoding']
  32. self.idf_path = config['DEFAULT']['idf_path']
  33. self.db_path = config['DEFAULT']['db_path']
  34. f = open(self.stop_words_path, encoding = self.stop_words_encoding)
  35. words = f.read()
  36. self.stop_words = set(words.split('\n'))
  37. def write_k_nearest_matrix_to_db(self):
  38. conn = sqlite3.connect(self.db_path)
  39. c = conn.cursor()
  40. c.execute('''DROP TABLE IF EXISTS knearest''')
  41. c.execute('''CREATE TABLE knearest
  42. (id INTEGER PRIMARY KEY, first INTEGER, second INTEGER,
  43. third INTEGER, fourth INTEGER, fifth INTEGER)''')
  44. for docid, doclist in self.k_nearest:
  45. c.execute("INSERT INTO knearest VALUES (?, ?, ?, ?, ?, ?)", tuple([docid] + doclist))
  46. conn.commit()
  47. conn.close()
  48. def is_number(self, s):
  49. try:
  50. float(s)
  51. return True
  52. except ValueError:
  53. return False
  54. def construct_dt_matrix(self, files, topK = 200):
  55. jieba.analyse.set_stop_words(self.stop_words_path)
  56. jieba.analyse.set_idf_path(self.idf_path)
  57. M = len(files)
  58. N = 1
  59. terms = {}
  60. dt = []
  61. for i in files:
  62. root = ET.parse(self.doc_dir_path + i).getroot()
  63. title = root.find('title').text
  64. body = root.find('body').text
  65. docid = int(root.find('id').text)
  66. tags = jieba.analyse.extract_tags(title + '。' + body, topK=topK, withWeight=True)
  67. #tags = jieba.analyse.extract_tags(title, topK=topK, withWeight=True)
  68. cleaned_dict = {}
  69. for word, tfidf in tags:
  70. word = word.strip().lower()
  71. if word == '' or self.is_number(word):
  72. continue
  73. cleaned_dict[word] = tfidf
  74. if word not in terms:
  75. terms[word] = N
  76. N += 1
  77. dt.append([docid, cleaned_dict])
  78. dt_matrix = [[0 for i in range(N)] for j in range(M)]
  79. i =0
  80. for docid, t_tfidf in dt:
  81. dt_matrix[i][0] = docid
  82. for term, tfidf in t_tfidf.items():
  83. dt_matrix[i][terms[term]] = tfidf
  84. i += 1
  85. dt_matrix = pd.DataFrame(dt_matrix)
  86. dt_matrix.index = dt_matrix[0]
  87. print('dt_matrix shape:(%d %d)'%(dt_matrix.shape))
  88. return dt_matrix
  89. def construct_k_nearest_matrix(self, dt_matrix, k):
  90. tmp = np.array(1 - pairwise_distances(dt_matrix[dt_matrix.columns[1:]], metric = "cosine"))
  91. similarity_matrix = pd.DataFrame(tmp, index = dt_matrix.index.tolist(), columns = dt_matrix.index.tolist())
  92. for i in similarity_matrix.index:
  93. tmp = [int(i),[]]
  94. j = 0
  95. while j < k:
  96. max_col = similarity_matrix.loc[i].idxmax(axis = 1)
  97. similarity_matrix.loc[i][max_col] = -1
  98. if max_col != i:
  99. tmp[1].append(int(max_col)) #max column name
  100. j += 1
  101. self.k_nearest.append(tmp)
  102. def gen_idf_file(self):
  103. files = listdir(self.doc_dir_path)
  104. n = float(len(files))
  105. idf = {}
  106. for i in files:
  107. root = ET.parse(self.doc_dir_path + i).getroot()
  108. title = root.find('title').text
  109. body = root.find('body').text
  110. seg_list = jieba.lcut(title + '。' + body, cut_all=False)
  111. seg_list = set(seg_list) - self.stop_words
  112. for word in seg_list:
  113. word = word.strip().lower()
  114. if word == '' or self.is_number(word):
  115. continue
  116. if word not in idf:
  117. idf[word] = 1
  118. else:
  119. idf[word] = idf[word] + 1
  120. idf_file = open(self.idf_path, 'w', encoding = 'utf-8')
  121. for word, df in idf.items():
  122. idf_file.write('%s %.9f\n'%(word, math.log(n / df)))
  123. idf_file.close()
  124. def find_k_nearest(self, k, topK):
  125. self.gen_idf_file()
  126. files = listdir(self.doc_dir_path)
  127. dt_matrix = self.construct_dt_matrix(files, topK)
  128. self.construct_k_nearest_matrix(dt_matrix, k)
  129. self.write_k_nearest_matrix_to_db()
  130. if __name__ == "__main__":
  131. print('-----start time: %s-----'%(datetime.today()))
  132. rm = RecommendationModule('../config.ini', 'utf-8')
  133. rm.find_k_nearest(5, 25)
  134. print('-----finish time: %s-----'%(datetime.today()))

这个模块的代码量最多,主要原因是需要构建文档词项矩阵,并且计算k邻居矩阵。矩阵数据结构的设计需要特别注意,否则会严重影响系统的效率。我刚开始把任务都扔给了pandas.DataFrame,后来发现当两个文档向量合并时,需要join连接操作,当数据量很大时,非常耗时,所以改成了先用python原始的list存储,最后一次性构造一个完整的pandas.DataFrame,速度比之前快了不少。

本文转载自:http://bitjoy.net/2016/01/09/introduction-to-building-a-search-engine-5

上传的附件 cloud_download 新闻搜索引擎.7z ( 3.33mb, 2次下载 )

发送私信

似是而非的对白,诉说着我和你的爱

23
文章数
19
评论数
eject