Spaces:
Running
on
T4
Running
on
T4
Removing stop words, but only for English
Browse files
app.py
CHANGED
@@ -22,6 +22,7 @@ from transformers import (
|
|
22 |
from prompts import system_prompt, example_prompt, main_prompt
|
23 |
from umap import UMAP
|
24 |
from hdbscan import HDBSCAN
|
|
|
25 |
|
26 |
# from cuml.cluster import HDBSCAN
|
27 |
# from cuml.manifold import UMAP
|
@@ -36,7 +37,7 @@ session = requests.Session()
|
|
36 |
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
|
37 |
keybert = KeyBERTInspired()
|
38 |
mmr = MaximalMarginalRelevance(diversity=0.3)
|
39 |
-
|
40 |
|
41 |
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
42 |
device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"
|
@@ -125,6 +126,7 @@ def fit_model(base_model, docs, embeddings):
|
|
125 |
umap_model=umap_model,
|
126 |
hdbscan_model=hdbscan_model,
|
127 |
representation_model=representation_model,
|
|
|
128 |
# Hyperparameters
|
129 |
top_n_words=10,
|
130 |
verbose=True,
|
|
|
22 |
from prompts import system_prompt, example_prompt, main_prompt
|
23 |
from umap import UMAP
|
24 |
from hdbscan import HDBSCAN
|
25 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
26 |
|
27 |
# from cuml.cluster import HDBSCAN
|
28 |
# from cuml.manifold import UMAP
|
|
|
37 |
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
|
38 |
keybert = KeyBERTInspired()
|
39 |
mmr = MaximalMarginalRelevance(diversity=0.3)
|
40 |
+
vectorizer_model = CountVectorizer(stop_words="english")
|
41 |
|
42 |
model_id = "meta-llama/Llama-2-7b-chat-hf"
|
43 |
device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"
|
|
|
126 |
umap_model=umap_model,
|
127 |
hdbscan_model=hdbscan_model,
|
128 |
representation_model=representation_model,
|
129 |
+
vectorizer_model=vectorizer_model,
|
130 |
# Hyperparameters
|
131 |
top_n_words=10,
|
132 |
verbose=True,
|