Spaces:

shimizukawa
/

python-no-senpai

Sleeping

shimizukawa committed on Sep 21, 2023

Commit

341f67a

1 Parent(s): c56ab56

2nd modify

Files changed (5) hide show

app.py CHANGED Viewed

@@ -143,9 +143,9 @@ def _get_related_url(metadata) -> Iterable[str]:
         if url in urls:
             continue
         urls.add(url)
-        created_at = datetime.fromtimestamp(m["created_at"])
         # print(m)
-        yield f'<p>URL: <a href="{url}">{url}</a> (created: {created_at:%Y-%m-%d})</p>'
 def _get_query_str_filter(

         if url in urls:
             continue
         urls.add(url)
+        ctime = datetime.fromtimestamp(m["ctime"])
         # print(m)
+        yield f'<p>URL: <a href="{url}">{url}</a> (created: {ctime:%Y-%m-%d})</p>'
 def _get_query_str_filter(

doc_loader.py CHANGED Viewed

@@ -16,19 +16,20 @@ def date_to_int(dt_str: str) -> int:
 def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
     """filename for file with ndjson
-        {"title": <page title>, "body": <page body>, "id": <page_id>, "ctime": ..., "user": <name>, "url": "https:..."}
         {"title": ...}
     """
     with open(filename, "r") as f:
         obj = [json.loads(line) for line in f]
     for data in obj:
         title = data["title"]
-        body = data["body"]
         doc = Doc(
             project_name=project_name,
             id=data["id"],
             title=title,
-            created_at=date_to_int(data["ctime"]),
             user=data["user"],
             url=data["url"],
         )

 def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
     """filename for file with ndjson
+        {"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
         {"title": ...}
     """
     with open(filename, "r") as f:
         obj = [json.loads(line) for line in f]
     for data in obj:
         title = data["title"]
+        body = data["content"]
+        ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
         doc = Doc(
             project_name=project_name,
             id=data["id"],
             title=title,
+            ctime=ctime,
             user=data["user"],
             url=data["url"],
         )

model.py CHANGED Viewed

@@ -6,6 +6,6 @@ class Doc:
     project_name: str
     id: int
     title: str
-    created_at: int
     user: str
     url: str

     project_name: str
     id: int
     title: str
+    ctime: int
     user: str
     url: str

requirements.txt CHANGED Viewed

@@ -9,3 +9,4 @@ sentence_transformers
 streamlit
 python-dateutil
 openai

 streamlit
 python-dateutil
 openai
+tqdm

store.py CHANGED Viewed

@@ -1,3 +1,5 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Qdrant
@@ -19,7 +21,7 @@ def get_text_chunk(docs):
 def store(texts):
     model_name = "intfloat/multilingual-e5-large"
-    model_kwargs = {"device": "cuda"}
     encode_kwargs = {"normalize_embeddings": False}
     embeddings = HuggingFaceEmbeddings(
         model_name=model_name,
@@ -27,13 +29,14 @@ def store(texts):
         encode_kwargs=encode_kwargs,
     )
     db_url, db_api_key, db_collection_name = DB_CONFIG
-    _ = Qdrant.from_documents(
-        texts,
-        embeddings,
-        url=db_url,
-        api_key=db_api_key,
-        collection_name=db_collection_name,
-    )
 def main(project_name: str, path: str) -> None:
@@ -52,7 +55,7 @@ if __name__ == "__main__":
     args = sys.argv
     if len(args) != 3:
-        print("No args, you need two args for repo_name, json_file_path")
     else:
         project_name = args[1]
         path = args[2]

+from tqdm import tqdm
+import torch
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.vectorstores import Qdrant
 def store(texts):
     model_name = "intfloat/multilingual-e5-large"
+    model_kwargs = {"device": "cuda:0" if torch.cuda.is_available() else "cpu"}
     encode_kwargs = {"normalize_embeddings": False}
     embeddings = HuggingFaceEmbeddings(
         model_name=model_name,
         encode_kwargs=encode_kwargs,
     )
     db_url, db_api_key, db_collection_name = DB_CONFIG
+    for text in tqdm(texts):
+        _ = Qdrant.from_documents(
+            [text],
+            embeddings,
+            url=db_url,
+            api_key=db_api_key,
+            collection_name=db_collection_name,
+        )
 def main(project_name: str, path: str) -> None:
     args = sys.argv
     if len(args) != 3:
+        print("No args, you need two args for project_name, json_file_path")
     else:
         project_name = args[1]
         path = args[2]