1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
from itertools import combinations

import Levenshtein
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist, squareform
class EditDistanceClustering:
    """Reorder the rows of a CSV file so rows with similar strings are adjacent.

    The values of one column are compared pairwise with the Levenshtein edit
    distance, grouped by hierarchical clustering, and the rows are then
    reordered so that every cluster forms one contiguous run in the output.
    """

    def __init__(self, input_csv: str, output_csv: str, column_name: str,
                 threshold: int = 5, linkage_method: str = "average"):
        """Store the configuration; no file is read until ``read_csv``/``run``.

        :param input_csv: path of the CSV file to read
        :param output_csv: path of the CSV file to write
        :param column_name: name of the column whose values are clustered
        :param threshold: cophenetic distance at which the dendrogram is cut
            (passed to ``fcluster`` with ``criterion='distance'``)
        :param linkage_method: ``method`` argument forwarded to
            ``scipy.cluster.hierarchy.linkage``. The default is ``'average'``
            because SciPy documents ``'ward'``, ``'centroid'`` and ``'median'``
            as correct only for Euclidean distances, which edit distance is not.
        """
        self.input_csv: str = input_csv
        self.output_csv: str = output_csv
        self.column_name: str = column_name
        self.threshold: int = threshold
        self.linkage_method: str = linkage_method
        self.df: pd.DataFrame | None = None

    def _require_df(self) -> pd.DataFrame:
        """Return the loaded DataFrame, or raise if nothing has been loaded."""
        if self.df is None:
            raise RuntimeError(
                "No data loaded: call read_csv() (or run()) before this method."
            )
        return self.df

    def read_csv(self) -> None:
        """Load ``input_csv`` into ``self.df``."""
        self.df = pd.read_csv(self.input_csv)

    def lev_distance(self, s1: str, s2: str) -> int:
        """Return the Levenshtein edit distance between two strings.

        :param s1: first string
        :param s2: second string
        :return: edit distance between ``s1`` and ``s2``
        """
        return Levenshtein.distance(s1, s2)

    def _condensed_distances(self) -> np.ndarray:
        """Return the pairwise distances of the column as a condensed vector.

        ``combinations`` yields the pairs (0, 1), (0, 2), ..., (n-2, n-1),
        which is the ordering ``squareform`` and ``linkage`` expect for a
        condensed distance vector.
        """
        strings: list[str] = (
            self._require_df()[self.column_name].astype(str).tolist()
        )
        return np.array(
            [self.lev_distance(a, b) for a, b in combinations(strings, 2)],
            dtype=float,
        )

    def compute_edit_distance_matrix(self) -> np.ndarray:
        """Compute the edit-distance matrix of the configured column.

        :return: symmetric ``(n, n)`` float array with a zero diagonal, where
            ``n`` is the number of rows in the loaded DataFrame
        """
        row_count: int = len(self._require_df())
        if row_count < 2:
            # squareform() maps an empty vector to a 1x1 matrix, which would
            # be the wrong shape for an empty table, so build it directly.
            return np.zeros((row_count, row_count), dtype=float)
        return squareform(self._condensed_distances())

    def cluster_and_sort(self) -> None:
        """Cluster the rows hierarchically and group each cluster together.

        Rows keep their original relative order inside a cluster. The cluster
        labels are used only for ordering and are never written into the
        DataFrame, so an existing column called ``'Cluster'`` is left intact.
        Tables with fewer than two rows are left unchanged.
        """
        df: pd.DataFrame = self._require_df()
        if len(df) < 2:
            # Nothing to compare; linkage() rejects an empty distance set.
            return

        # linkage() must receive the condensed vector: a 2-D array would be
        # interpreted as observation vectors rather than as distances.
        linkage_matrix: np.ndarray = linkage(
            self._condensed_distances(), method=self.linkage_method
        )
        clusters: np.ndarray = fcluster(
            linkage_matrix, self.threshold, criterion='distance'
        )

        # A stable sort keeps the input order of rows that share a label.
        order: np.ndarray = np.argsort(clusters, kind='stable')
        self.df = df.iloc[order]

    def save_csv(self) -> None:
        """Write the DataFrame to ``output_csv`` (UTF-8 with BOM, no index)."""
        self._require_df().to_csv(
            self.output_csv, index=False, encoding='utf-8-sig'
        )

    def run(self) -> None:
        """Run the full pipeline: read the CSV, cluster and sort, save the CSV."""
        self.read_csv()
        self.cluster_and_sort()
        self.save_csv()
if __name__ == "__main__":
    # Script entry point: read the task CSV, group rows whose values in the
    # query column are close in edit distance, and write the reordered rows
    # to a separate output file.
    clustering: EditDistanceClustering = EditDistanceClustering(
        input_csv='data/second_fine_grained_tasks_new.csv',
        output_csv='data/second_fine_grained_tasks_new_sorted1.csv',
        column_name='细粒度_任务相关查询',
    )
    clustering.run()
|