Spaces:
Running
Running
shimizukawa
committed on
Commit
•
341f67a
1
Parent(s):
c56ab56
2nd modify
Browse files
- app.py +2 -2
- doc_loader.py +4 -3
- model.py +1 -1
- requirements.txt +1 -0
- store.py +12 -9
app.py
CHANGED
@@ -143,9 +143,9 @@ def _get_related_url(metadata) -> Iterable[str]:
|
|
143 |
if url in urls:
|
144 |
continue
|
145 |
urls.add(url)
|
146 |
-
|
147 |
# print(m)
|
148 |
-
yield f'<p>URL: <a href="{url}">{url}</a> (created: {
|
149 |
|
150 |
|
151 |
def _get_query_str_filter(
|
|
|
143 |
if url in urls:
|
144 |
continue
|
145 |
urls.add(url)
|
146 |
+
ctime = datetime.fromtimestamp(m["ctime"])
|
147 |
# print(m)
|
148 |
+
yield f'<p>URL: <a href="{url}">{url}</a> (created: {ctime:%Y-%m-%d})</p>'
|
149 |
|
150 |
|
151 |
def _get_query_str_filter(
|
doc_loader.py
CHANGED
@@ -16,19 +16,20 @@ def date_to_int(dt_str: str) -> int:
|
|
16 |
def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
|
17 |
"""filename for file with ndjson
|
18 |
|
19 |
-
{"
|
20 |
{"title": ...}
|
21 |
"""
|
22 |
with open(filename, "r") as f:
|
23 |
obj = [json.loads(line) for line in f]
|
24 |
for data in obj:
|
25 |
title = data["title"]
|
26 |
-
body = data["
|
|
|
27 |
doc = Doc(
|
28 |
project_name=project_name,
|
29 |
id=data["id"],
|
30 |
title=title,
|
31 |
-
|
32 |
user=data["user"],
|
33 |
url=data["url"],
|
34 |
)
|
|
|
16 |
def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
|
17 |
"""filename for file with ndjson
|
18 |
|
19 |
+
{"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
|
20 |
{"title": ...}
|
21 |
"""
|
22 |
with open(filename, "r") as f:
|
23 |
obj = [json.loads(line) for line in f]
|
24 |
for data in obj:
|
25 |
title = data["title"]
|
26 |
+
body = data["content"]
|
27 |
+
ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
|
28 |
doc = Doc(
|
29 |
project_name=project_name,
|
30 |
id=data["id"],
|
31 |
title=title,
|
32 |
+
ctime=ctime,
|
33 |
user=data["user"],
|
34 |
url=data["url"],
|
35 |
)
|
model.py
CHANGED
@@ -6,6 +6,6 @@ class Doc:
|
|
6 |
project_name: str
|
7 |
id: int
|
8 |
title: str
|
9 |
-
|
10 |
user: str
|
11 |
url: str
|
|
|
6 |
project_name: str
|
7 |
id: int
|
8 |
title: str
|
9 |
+
ctime: int
|
10 |
user: str
|
11 |
url: str
|
requirements.txt
CHANGED
@@ -9,3 +9,4 @@ sentence_transformers
|
|
9 |
streamlit
|
10 |
python-dateutil
|
11 |
openai
|
|
|
|
9 |
streamlit
|
10 |
python-dateutil
|
11 |
openai
|
12 |
+
tqdm
|
store.py
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
2 |
from langchain.embeddings import HuggingFaceEmbeddings
|
3 |
from langchain.vectorstores import Qdrant
|
@@ -19,7 +21,7 @@ def get_text_chunk(docs):
|
|
19 |
|
20 |
def store(texts):
|
21 |
model_name = "intfloat/multilingual-e5-large"
|
22 |
-
model_kwargs = {"device": "cuda"}
|
23 |
encode_kwargs = {"normalize_embeddings": False}
|
24 |
embeddings = HuggingFaceEmbeddings(
|
25 |
model_name=model_name,
|
@@ -27,13 +29,14 @@ def store(texts):
|
|
27 |
encode_kwargs=encode_kwargs,
|
28 |
)
|
29 |
db_url, db_api_key, db_collection_name = DB_CONFIG
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
37 |
|
38 |
|
39 |
def main(project_name: str, path: str) -> None:
|
@@ -52,7 +55,7 @@ if __name__ == "__main__":
|
|
52 |
|
53 |
args = sys.argv
|
54 |
if len(args) != 3:
|
55 |
-
print("No args, you need two args for
|
56 |
else:
|
57 |
project_name = args[1]
|
58 |
path = args[2]
|
|
|
1 |
+
from tqdm import tqdm
|
2 |
+
import torch
|
3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.vectorstores import Qdrant
|
|
|
21 |
|
22 |
def store(texts):
|
23 |
model_name = "intfloat/multilingual-e5-large"
|
24 |
+
model_kwargs = {"device": "cuda:0" if torch.cuda.is_available() else "cpu"}
|
25 |
encode_kwargs = {"normalize_embeddings": False}
|
26 |
embeddings = HuggingFaceEmbeddings(
|
27 |
model_name=model_name,
|
|
|
29 |
encode_kwargs=encode_kwargs,
|
30 |
)
|
31 |
db_url, db_api_key, db_collection_name = DB_CONFIG
|
32 |
+
for text in tqdm(texts):
|
33 |
+
_ = Qdrant.from_documents(
|
34 |
+
[text],
|
35 |
+
embeddings,
|
36 |
+
url=db_url,
|
37 |
+
api_key=db_api_key,
|
38 |
+
collection_name=db_collection_name,
|
39 |
+
)
|
40 |
|
41 |
|
42 |
def main(project_name: str, path: str) -> None:
|
|
|
55 |
|
56 |
args = sys.argv
|
57 |
if len(args) != 3:
|
58 |
+
print("No args, you need two args for project_name, json_file_path")
|
59 |
else:
|
60 |
project_name = args[1]
|
61 |
path = args[2]
|