from typing import List

# numpy, KMeans and KElbowVisualizer are only needed by the (currently disabled)
# elbow-based selection of K in __3_clustering__ below.
# import numpy as np
# from sklearn.cluster import KMeans
# from yellowbrick.cluster import KElbowVisualizer

from .config import BaselineConfig, Configuration
from .clusters import ClusterList
from ..utils import __create_model__

class ClusterPipeline:
    def __init__(self, config: Configuration = None):
        # fall back to the baseline configuration when none is given
        if config is None:
            self.__setup__(BaselineConfig())
        else:
            self.__setup__(config)

    def __setup__(self, config: Configuration):
        # build the four pipeline components from the configuration
        self.PTM = __create_model__(config.plm)  # pre-trained (sentence-transformers) encoder
        self.dimension_reduction = __create_model__(config.dimension_reduction)  # TODO
        self.clustering = __create_model__(config.clustering)
        self.keywords_extraction = __create_model__(config.keywords_extraction)

    def __1_generate_word_embeddings__(self, documents: List[str]):
        '''
        :param documents: a list of N strings
        :return: np.ndarray of shape N x 384 (sentence-transformers embeddings)
        '''
        print('>>> start generating word embeddings...')
        embeddings = self.PTM.encode(documents)
        print('>>> successfully generated word embeddings...')
        return embeddings

    def __2_dimension_reduction__(self, embeddings):
        '''
        :param embeddings: N x D
        :return: N x d, with d << D
        '''
        if self.dimension_reduction is None:
            return embeddings
        print('>>> start dimension reduction...')
        # the reducer is assumed to be callable, like the other components built
        # by __create_model__ (see __3_clustering__ / __4_keywords_extraction__)
        embeddings = self.dimension_reduction(embeddings)
        print('>>> finished dimension reduction...')
        return embeddings

    def __3_clustering__(self, embeddings, return_cluster_centers: bool = False, best_k: int = 5):
        '''
        :param embeddings: N x d
        :param return_cluster_centers: also return the cluster centers if True
        :param best_k: number of clusters K
        :return: a ClusterList (and the cluster centers when requested)
        '''
        if self.clustering is None:
            return embeddings
        print('>>> start clustering...')
        # Elbow-based selection of K (currently disabled); when re-enabled it needs
        # numpy, KMeans and KElbowVisualizer from the commented imports above.
        # model = KMeans()
        # visualizer = KElbowVisualizer(
        #     model, k=(2, 12), metric='calinski_harabasz', timings=False, locate_elbow=False
        # )
        # visualizer.fit(embeddings)
        # best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
        # print(f'>>> The best K is {best_k}.')
        labels, cluster_centers = self.clustering(embeddings, k=best_k)
        clusters = ClusterList(best_k)
        clusters.instantiate(labels)
        print('>>> finished clustering...')
        if return_cluster_centers:
            return clusters, cluster_centers
        return clusters

    def __4_keywords_extraction__(self, clusters: ClusterList, documents: List[str]):
        '''
        :param clusters: the ClusterList covering the N documents
        :param documents: the original N documents
        :return: the same clusters, with keyphrases added to each cluster
        '''
        if self.keywords_extraction is None:
            return clusters
        print('>>> start keywords extraction')
        for cluster in clusters:
            doc_ids = cluster.elements()
            input_abstracts = [documents[i] for i in doc_ids]       # List[str]
            keyphrases = self.keywords_extraction(input_abstracts)  # keyphrases for the whole cluster
            cluster.add_keyphrase(keyphrases)
            # per-document alternative:
            # for doc_id in doc_ids:
            #     keyphrases = self.keywords_extraction(documents[doc_id])
            #     cluster.add_keyphrase(keyphrases)
        print('>>> finished keywords extraction')
        return clusters

    def __call__(self, documents: List[str], best_k: int = 5):
        print('>>> pipeline starts...')
        x = self.__1_generate_word_embeddings__(documents)
        x = self.__2_dimension_reduction__(x)
        clusters = self.__3_clustering__(x, best_k=best_k)
        outputs = self.__4_keywords_extraction__(clusters, documents)
        print('>>> pipeline finished!\n')
        return outputs
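
if __name__ == '__main__':
    # Usage sketch (illustrative, not part of the original module): drives the
    # pipeline end to end on a few made-up abstracts. It assumes the default
    # BaselineConfig provides every sub-model and that the module is executed
    # from its package context (e.g. `python -m <package>.<module>`), since the
    # relative imports above fail when the file is run directly.
    sample_documents = [
        'Transformer models for abstractive text summarization.',
        'Graph neural networks for molecular property prediction.',
        'Contrastive learning of universal sentence embeddings.',
        'Reinforcement learning for robotic grasping.',
    ]
    clusters = ClusterPipeline()(sample_documents, best_k=2)
    for cluster in clusters:
        # each cluster exposes the indices of the documents assigned to it
        print(cluster.elements())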