shimizukawa committed on
Commit
341f67a
1 Parent(s): c56ab56

2nd modify

Browse files
Files changed (5) hide show
  1. app.py +2 -2
  2. doc_loader.py +4 -3
  3. model.py +1 -1
  4. requirements.txt +1 -0
  5. store.py +12 -9
app.py CHANGED
@@ -143,9 +143,9 @@ def _get_related_url(metadata) -> Iterable[str]:
143
  if url in urls:
144
  continue
145
  urls.add(url)
146
- created_at = datetime.fromtimestamp(m["created_at"])
147
  # print(m)
148
- yield f'<p>URL: <a href="{url}">{url}</a> (created: {created_at:%Y-%m-%d})</p>'
149
 
150
 
151
  def _get_query_str_filter(
 
143
  if url in urls:
144
  continue
145
  urls.add(url)
146
+ ctime = datetime.fromtimestamp(m["ctime"])
147
  # print(m)
148
+ yield f'<p>URL: <a href="{url}">{url}</a> (created: {ctime:%Y-%m-%d})</p>'
149
 
150
 
151
  def _get_query_str_filter(
doc_loader.py CHANGED
@@ -16,19 +16,20 @@ def date_to_int(dt_str: str) -> int:
16
  def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
17
  """filename for file with ndjson
18
 
19
- {"title": <page title>, "body": <page body>, "id": <page_id>, "ctime": ..., "user": <name>, "url": "https:..."}
20
  {"title": ...}
21
  """
22
  with open(filename, "r") as f:
23
  obj = [json.loads(line) for line in f]
24
  for data in obj:
25
  title = data["title"]
26
- body = data["body"]
 
27
  doc = Doc(
28
  project_name=project_name,
29
  id=data["id"],
30
  title=title,
31
- created_at=date_to_int(data["ctime"]),
32
  user=data["user"],
33
  url=data["url"],
34
  )
 
16
  def get_contents(project_name: str, filename: str) -> Iterator[tuple[Doc, str]]:
17
  """filename for file with ndjson
18
 
19
+ {"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
20
  {"title": ...}
21
  """
22
  with open(filename, "r") as f:
23
  obj = [json.loads(line) for line in f]
24
  for data in obj:
25
  title = data["title"]
26
+ body = data["content"]
27
+ ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
28
  doc = Doc(
29
  project_name=project_name,
30
  id=data["id"],
31
  title=title,
32
+ ctime=ctime,
33
  user=data["user"],
34
  url=data["url"],
35
  )
model.py CHANGED
@@ -6,6 +6,6 @@ class Doc:
6
  project_name: str
7
  id: int
8
  title: str
9
- created_at: int
10
  user: str
11
  url: str
 
6
  project_name: str
7
  id: int
8
  title: str
9
+ ctime: int
10
  user: str
11
  url: str
requirements.txt CHANGED
@@ -9,3 +9,4 @@ sentence_transformers
9
  streamlit
10
  python-dateutil
11
  openai
 
 
9
  streamlit
10
  python-dateutil
11
  openai
12
+ tqdm
store.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  from langchain.text_splitter import RecursiveCharacterTextSplitter
2
  from langchain.embeddings import HuggingFaceEmbeddings
3
  from langchain.vectorstores import Qdrant
@@ -19,7 +21,7 @@ def get_text_chunk(docs):
19
 
20
  def store(texts):
21
  model_name = "intfloat/multilingual-e5-large"
22
- model_kwargs = {"device": "cuda"}
23
  encode_kwargs = {"normalize_embeddings": False}
24
  embeddings = HuggingFaceEmbeddings(
25
  model_name=model_name,
@@ -27,13 +29,14 @@ def store(texts):
27
  encode_kwargs=encode_kwargs,
28
  )
29
  db_url, db_api_key, db_collection_name = DB_CONFIG
30
- _ = Qdrant.from_documents(
31
- texts,
32
- embeddings,
33
- url=db_url,
34
- api_key=db_api_key,
35
- collection_name=db_collection_name,
36
- )
 
37
 
38
 
39
  def main(project_name: str, path: str) -> None:
@@ -52,7 +55,7 @@ if __name__ == "__main__":
52
 
53
  args = sys.argv
54
  if len(args) != 3:
55
- print("No args, you need two args for repo_name, json_file_path")
56
  else:
57
  project_name = args[1]
58
  path = args[2]
 
1
+ from tqdm import tqdm
2
+ import torch
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain.embeddings import HuggingFaceEmbeddings
5
  from langchain.vectorstores import Qdrant
 
21
 
22
  def store(texts):
23
  model_name = "intfloat/multilingual-e5-large"
24
+ model_kwargs = {"device": "cuda:0" if torch.cuda.is_available() else "cpu"}
25
  encode_kwargs = {"normalize_embeddings": False}
26
  embeddings = HuggingFaceEmbeddings(
27
  model_name=model_name,
 
29
  encode_kwargs=encode_kwargs,
30
  )
31
  db_url, db_api_key, db_collection_name = DB_CONFIG
32
+ for text in tqdm(texts):
33
+ _ = Qdrant.from_documents(
34
+ [text],
35
+ embeddings,
36
+ url=db_url,
37
+ api_key=db_api_key,
38
+ collection_name=db_collection_name,
39
+ )
40
 
41
 
42
  def main(project_name: str, path: str) -> None:
 
55
 
56
  args = sys.argv
57
  if len(args) != 3:
58
+ print("No args, you need two args for project_name, json_file_path")
59
  else:
60
  project_name = args[1]
61
  path = args[2]