# Source code for pyunit_newword.words

#!/usr/bin/python3.7
# -*- coding: utf-8 -*-
# @Time  : 2018/2/28 10:58
# @Author: Jtyoui@qq.com
from tqdm import tqdm
import re
import math


class NewWords:
    """Discover candidate new words in text from character n-gram statistics.

    Intended call order: :meth:`add_text` (one or more times), then
    :meth:`analysis_data`, then :meth:`get_words`.

    ``self.vocab`` maps every candidate string to a list
    ``[count, frequency, left_neighbours, right_neighbours]``. After
    :meth:`analysis_data` each candidate longer than one character holds two
    further entries: ``[..., cohesion, freedom]``.
    """

    # Any character that is not in U+4E00-U+9FA5, a digit or an ASCII letter
    # ends the current segment.
    _SEGMENT_BREAK = re.compile('[^\u4e00-\u9fa50-9a-zA-Z]')
    # Only characters in U+4E00-U+9FA5 and digits are counted; ASCII letters
    # inside a segment are dropped, so their neighbours become adjacent.
    _COUNTED_CHAR = re.compile(r'[\u4e00-\u9fa50-9]')

    def __init__(self, max_split=5, filter_cond=None, filter_free=None):
        """Create an empty vocabulary.

        :param max_split: maximum candidate length in characters (the "n" of
            the n-grams that are collected)
        :param filter_cond: minimum cohesion used by the manual threshold rule
            in :meth:`_filter_algorithm`; ``None`` means "no cohesion threshold"
        :param filter_free: minimum freedom used by the manual threshold rule
            in :meth:`_filter_algorithm`; ``None`` means "no freedom threshold"
        """
        self.vocab = {}
        self.max_split = max_split
        self.all_words_len = 0
        self.cond = filter_cond
        self.free = filter_free

    def add_text(self, file, encoding='UTF-8'):
        """Read a text file and accumulate n-gram statistics into ``self.vocab``.

        For every candidate the method records its occurrence count, its
        frequency, the set of characters seen immediately to its left and the
        set of characters seen immediately to its right.

        :param file: path of the text file to read
        :param encoding: text encoding of the file
        """
        with open(file=file, mode='r', encoding=encoding) as handle:
            # readlines() keeps the total known so the progress bar can show it.
            for raw_line in tqdm(handle.readlines(), desc='读取数据进度条'):
                for segment in self._SEGMENT_BREAK.split(raw_line.strip()):
                    self._count_segment(self._COUNTED_CHAR.findall(segment))
        # Frequencies are derived only once the total is final; computing them
        # while the total is still growing would give every candidate a
        # different denominator.
        self._refresh_frequencies()

    def _count_segment(self, chars):
        """Count every n-gram (n <= ``max_split``) of one segment's characters."""
        length = len(chars)
        self.all_words_len += length
        for start in range(length):
            # An n-gram may not run past the end of the segment.
            longest = min(self.max_split, length - start)
            for size in range(1, longest + 1):
                stop = start + size
                candidate = ''.join(chars[start:stop])
                entry = self.vocab.get(candidate)
                if entry is None:
                    entry = self.vocab[candidate] = [0, 0, set(), set()]
                entry[0] += 1
                if start != 0:
                    entry[2].add(chars[start - 1])
                if stop != length:
                    entry[3].add(chars[stop])

    def _refresh_frequencies(self):
        """Recompute each entry's frequency as count / total characters read."""
        total = self.all_words_len
        if not total:
            # Nothing was counted, so there is nothing to normalise.
            return
        for entry in self.vocab.values():
            entry[1] = entry[0] / total

    def analysis_data(self):
        """Score every multi-character candidate in ``self.vocab``.

        Two values are stored at indices 4 and 5 of the candidate's entry:

        * cohesion: the minimum, over every way of cutting the candidate into
          a prefix and a suffix, of
          ``log2(freq(candidate) / (freq(prefix) * freq(suffix)))``;
        * freedom: the smaller of the left and right neighbour scores (see
          :meth:`_neighbour_entropy`).

        Calling the method again overwrites the previous scores, so it can be
        re-run after more text has been added.
        """
        vocab = self.vocab
        for key in tqdm(vocab, desc='分析数据进度条'):
            key_len = len(key)
            if key_len == 1:
                # Single characters cannot be cut, so they get no scores.
                continue
            attribute = vocab[key]
            frequency = attribute[1]
            cohesion = min(
                math.log2(frequency / (vocab[key[:cut]][1] * vocab[key[cut:]][1]))
                for cut in range(1, key_len)
            )
            left_free = self._neighbour_entropy(attribute[2])
            right_free = self._neighbour_entropy(attribute[3])
            # Slice assignment replaces old scores instead of appending again.
            attribute[4:] = [cohesion, min(right_free, left_free)]

    def _neighbour_entropy(self, neighbours):
        """Return the sum of ``-p * log2(p)`` over a set of neighbour characters.

        ``p`` is the neighbour character's own frequency in ``self.vocab``
        (its overall frequency, not its frequency next to the candidate).
        """
        total = 0
        for char in neighbours:
            probability = self.vocab[char][1]
            total -= math.log2(probability) * probability
        return total

    def _filter_algorithm(self, x):
        """Decide whether one ``(word, attribute)`` vocabulary item is kept.

        ``attribute`` is ``[count, frequency, left_neighbours,
        right_neighbours, cohesion, freedom]``; indices 4 and 5 exist only
        after :meth:`analysis_data` has run.

        Rules, in order:

        1. single-character words are rejected;
        2. cohesion <= 0.1 is rejected;
        3. a word with no left and no right neighbours seen more than twice
           is accepted;
        4. a word seen more than 100 times whose left and right neighbour
           sets each hold at least 10% of that count is accepted;
        5. otherwise the word must meet the thresholds given to the
           constructor. A threshold left as ``None`` is not applied; when
           both are ``None`` this rule accepts nothing.

        :param x: ``(word, attribute)`` pair as produced by ``vocab.items()``
        :return: ``True`` when the word is kept, ``False`` otherwise
        """
        if len(x[0]) == 1:
            return False
        attribute = x[1]
        count, left, right = attribute[0], attribute[2], attribute[3]
        cohesion = attribute[4]
        if cohesion <= 0.1:
            return False
        if not left and not right and count > 2:
            return True
        if count > 100 and len(left) >= count * 0.1 and len(right) >= count * 0.1:
            return True
        # Comparing a float with None raises TypeError, so unset thresholds
        # have to be handled explicitly.
        if self.cond is None and self.free is None:
            return False
        meets_cond = self.cond is None or cohesion >= self.cond
        meets_free = self.free is None or attribute[5] >= self.free
        return meets_cond and meets_free

    def get_words(self):
        """Return a lazy iterator over the ``(word, attribute)`` items that
        pass :meth:`_filter_algorithm`."""
        return filter(self._filter_algorithm, self.vocab.items())