Spaces:
Runtime error
Runtime error
from typing import List | |
from .config import BaselineConfig, Configuration | |
from ..utils import __create_model__ | |
# import numpy as np | |
from sklearn.cluster import KMeans | |
# from yellowbrick.cluster import KElbowVisualizer | |
from .clusters import ClusterList | |
class ClusterPipeline:
    """Four-stage document pipeline: embed, reduce, cluster, extract keywords.

    Every stage is backed by an object built by ``__create_model__`` from the
    matching field of the configuration. Stages 2-4 hand their input back
    unchanged when their model is ``None``.
    """

    def __init__(self, config: Configuration = None):
        """Build the stage models.

        :param config: pipeline configuration; ``BaselineConfig()`` is used
            when it is omitted or ``None``.
        """
        self.__setup__(BaselineConfig() if config is None else config)

    def __setup__(self, config: Configuration):
        """Instantiate one model per stage from the fields of ``config``."""
        self.PTM = __create_model__(config.plm)
        self.dimension_reduction = __create_model__(config.dimension_reduction)  # TODO
        self.clustering = __create_model__(config.clustering)
        self.keywords_extraction = __create_model__(config.keywords_extraction)

    def __1_generate_word_embeddings__(self, documents: List[str]):
        """Stage 1: encode the documents with the pre-trained model.

        :param documents: a list of N strings
        :return: whatever ``self.PTM.encode`` returns (the original note says
            an Nx384 ``np.ndarray`` for sentence-transformers)
        """
        print('>>> start generating word embeddings...')
        embeddings = self.PTM.encode(documents)
        # Report success only once encoding has actually completed.
        print('>>> successfully generated word embeddings...')
        return embeddings

    def __2_dimenstion_reduction__(self, embeddings):
        """Stage 2: reduce the embedding dimensionality (not implemented yet).

        :param embeddings: NxD
        :return: intended to be Nxd with d << D; currently the input itself
        """
        if self.dimension_reduction is None:
            return embeddings

        print('>>> start dimension reduction...')
        # TODO: apply self.dimension_reduction here. Until that exists the
        # embeddings are passed through unchanged; this path used to fall off
        # the end of the method and return None, which broke later stages.
        print('>>> finished dimension reduction...')
        return embeddings

    def __3_clustering__(self, embeddings, return_cluster_centers=False, best_k: int = 5):
        """Stage 3: group the embeddings into ``best_k`` clusters.

        :param embeddings: Nxd
        :param return_cluster_centers: also return the cluster centers
        :param best_k: number of clusters, supplied by the caller (automatic
            selection of K is currently disabled)
        :return: a ``ClusterList``, or ``(ClusterList, cluster_centers)`` when
            ``return_cluster_centers`` is true. If no clustering model is
            configured, ``embeddings`` is returned as-is regardless of
            ``return_cluster_centers``.
        """
        if self.clustering is None:
            return embeddings

        print('>>> start clustering...')
        labels, cluster_centers = self.clustering(embeddings, k=best_k)
        clusters = ClusterList(best_k)
        clusters.instantiate(labels)
        print('>>> finished clustering...')

        if return_cluster_centers:
            return clusters, cluster_centers
        return clusters

    def __4_keywords_extraction__(self, clusters: ClusterList, documents: List[str]):
        """Stage 4: attach keyphrases to every cluster.

        :param clusters: clusters over the N documents
        :param documents: the N documents, indexed by the ids that
            ``cluster.elements()`` yields
        :return: clusters, where each cluster has added keyphrases
        """
        if self.keywords_extraction is None:
            return clusters

        print('>>> start keywords extraction')
        for cluster in clusters:
            # The extractor is called once per cluster, on all of its
            # documents together, rather than once per document.
            input_abstracts = [documents[doc_id] for doc_id in cluster.elements()]
            keyphrases = self.keywords_extraction(input_abstracts)
            cluster.add_keyphrase(keyphrases)
        print('>>> finished keywords extraction')
        return clusters

    def __call__(self, documents: List[str], best_k: int = 5):
        """Run all four stages on ``documents``.

        :param documents: a list of N strings
        :param best_k: number of clusters passed to the clustering stage
        :return: the output of the keywords-extraction stage
        """
        print('>>> pipeline starts...')
        x = self.__1_generate_word_embeddings__(documents)
        x = self.__2_dimenstion_reduction__(x)
        clusters = self.__3_clustering__(x, best_k=best_k)
        outputs = self.__4_keywords_extraction__(clusters, documents)
        print('>>> pipeline finished!\n')
        return outputs