import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from collections import defaultdict
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
class PhraseClustering:
    def __init__(self, model_name="thenlper/gte-base-zh", eps=0.3, min_samples=2):
        """
        Initialize the sentence-embedding model and clustering parameters.
        :param model_name: pretrained sentence-embedding model name
        :param eps: distance threshold; used as the dendrogram cut height for
                    hierarchical clustering (and as the radius in the optional
                    DBSCAN variant below)
        :param min_samples: minimum samples per cluster (only used by DBSCAN)
        """
        self.model = SentenceTransformer(model_name)
        self.eps = eps
        self.min_samples = min_samples
    def load_data(self, file_path):
        """
        Read phrases from a CSV file.
        :param file_path: path to the CSV file
        :return: list of phrases
        """
        df = pd.read_csv(file_path)
        if "细粒度_任务相关查询" not in df.columns:
            raise ValueError("CSV file must contain a '细粒度_任务相关查询' column")
        return df["细粒度_任务相关查询"].dropna().tolist()
    def encode_phrases(self, phrases):
        """
        Encode phrases with the sentence-embedding model.
        :param phrases: list of phrases
        :return: phrase embeddings (NumPy array)
        """
        embeddings = self.model.encode(phrases, convert_to_numpy=True)
        return embeddings
    def cluster_phrases(self, embeddings):
        """
        Cluster phrases with average-linkage hierarchical clustering.
        :param embeddings: phrase embeddings
        :return: cluster labels
        """
        # linkage() treats a 2-D input as raw observations, so pass a condensed
        # cosine-distance vector rather than the square 1 - inner-product matrix
        distances = pdist(embeddings, metric="cosine")
        Z = linkage(distances, method="average")
        labels = fcluster(Z, t=self.eps, criterion="distance")
        return labels
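    # The class imports DBSCAN and stores min_samples, but the pipeline above
    # never uses them. The method below is a minimal sketch of the DBSCAN
    # alternative those parameters suggest (the name cluster_phrases_dbscan is
    # not in the original); with metric="cosine", eps lives on the same scale
    # as the hierarchical cut height.
    def cluster_phrases_dbscan(self, embeddings):
        """
        Optional DBSCAN-based alternative to cluster_phrases (a sketch).
        :param embeddings: phrase embeddings
        :return: cluster labels; noise points are labeled -1
        """
        db = DBSCAN(eps=self.eps, min_samples=self.min_samples, metric="cosine")
        return db.fit_predict(embeddings)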
    def group_clusters(self, phrases, labels):
        """
        Group phrases by their cluster labels.
        :param phrases: original list of phrases
        :param labels: cluster labels
        :return: clustering result as a dict {cluster_id: list of phrases}
        """
        cluster_dict = defaultdict(list)
        for phrase, label in zip(phrases, labels):
            cluster_dict[label].append(phrase)
        return cluster_dict
    def save_clusters(self, cluster_dict, output_path):
        """
        Save the clustering result to a CSV file, one column per cluster.
        :param cluster_dict: dict of clustered phrases
        :param output_path: output CSV file path
        """
        # drop the -1 label (DBSCAN noise; fcluster labels start at 1, so this
        # filter is a no-op on the hierarchical path)
        filtered_clusters = {k: v for k, v in cluster_dict.items() if k != -1}

        # pad every cluster with empty strings to the length of the largest one
        max_len = max(len(v) for v in filtered_clusters.values())
        cluster_columns = [v + [""] * (max_len - len(v)) for v in filtered_clusters.values()]

        df = pd.DataFrame(cluster_columns).T
        df.to_csv(output_path, index=False, encoding="utf-8-sig")
    def run(self, input_csv, output_csv):
        """
        Full pipeline: load data -> compute embeddings -> cluster -> save results.
        :param input_csv: input CSV file path
        :param output_csv: output CSV file path
        """
        print("1. Loading data...")
        phrases = self.load_data(input_csv)
        print("2. Computing phrase embeddings...")
        embeddings = self.encode_phrases(phrases)

        print("3. Running hierarchical clustering...")
        labels = self.cluster_phrases(embeddings)

        print("4. Organizing cluster results...")
        cluster_dict = self.group_clusters(phrases, labels)

        print(f"5. Found {len(cluster_dict)} clusters, saving to {output_csv} ...")
        self.save_clusters(cluster_dict, output_csv)
        print("✅ Done!")
if __name__ == "__main__":
    input_file = "taskPhraseClustering/second_fine_grained_tasks_new_sorted.csv"
    output_file = "taskPhraseClustering/output_clusters.csv"
    clustering = PhraseClustering(eps=3, min_samples=2)
    clustering.run(input_file, output_file)
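# Expected I/O (inferred from the code above, not verified against the data):
#   input : CSV with a "细粒度_任务相关查询" column, one phrase per row
#   output: CSV where each column holds one cluster's phrases, padded with
#           empty strings to equal length
# Note: with the pdist fix, eps is a cut height over cosine distances, which
# lie in [0, 2], so eps=3 would collapse everything into a single cluster; a
# smaller value such as the 0.3 default is likely what's intended here.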