terapyon committed on
Commit
cd3709a
1 Parent(s): f913bc4

make to store functions

Browse files
Files changed (6) hide show
  1. .gitignore +5 -0
  2. README.md +3 -3
  3. config.py +21 -0
  4. gh_issue_loader.py +67 -0
  5. requirments.txt +10 -0
  6. store.py +59 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .vscode/
2
+ __pycache__/
3
+ venv/
4
+ qdrant_storage/
5
+ data/
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: Gh Issue Search
3
  emoji: 🐠
4
  colorFrom: green
5
  colorTo: purple
6
- sdk: gradio
7
- sdk_version: 3.40.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
1
  ---
2
+ title: Github Issue Search
3
  emoji: 🐠
4
  colorFrom: green
5
  colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.25.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
config.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+
4
+ SAAS = False  # when True, DB_CONFIG is built by get_db_config() (environment variables); otherwise by the localhost helper
5
+
6
+
7
def get_db_config():
    """Return Qdrant connection settings taken from the environment.

    The URL comes from ``QDRANT_URL`` and the API key from
    ``QDRANT_API_KEY``; the collection name is fixed.

    Returns:
        A ``(url, api_key, collection_name)`` tuple.

    Raises:
        KeyError: If either environment variable is not set.
    """
    env = os.environ
    return env["QDRANT_URL"], env["QDRANT_API_KEY"], "gh-issues"
12
+
13
+
14
def get_local_db_config():
    """Return Qdrant connection settings for a local instance.

    No API key is used for the local instance, so the second element of
    the result is always ``None``.

    Returns:
        A ``(url, api_key, collection_name)`` tuple, i.e.
        ``("localhost", None, "gh-issues")``.
    """
    return "localhost", None, "gh-issues"


# Backward-compatible alias: the function was originally published under this
# misspelled name, and existing callers still reference it.
get_local_db_congin = get_local_db_config
19
+
20
+
21
+ DB_CONFIG = get_db_config() if SAAS else get_local_db_congin()  # (url, api_key, collection_name) tuple, chosen once at import time
gh_issue_loader.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, asdict
2
+ import json
3
+ from typing import Iterator
4
+ from dateutil.parser import parse
5
+ from langchain.docstore.document import Document
6
+ from langchain.document_loaders.base import BaseLoader
7
+
8
+
9
@dataclass
class Issue:
    """Metadata describing one GitHub issue or one comment on an issue."""

    repo_name: str  # repository name supplied by the caller
    id: int  # issue "number" for issues, comment "id" for comments
    title: str  # issue title (comments carry their parent issue's title)
    created_at: int  # creation time as an integer timestamp (see date_to_int)
    user: str  # value of the record's "user.login" field
    url: str  # value of the record's "html_url" field
    labels: list[str]  # the parent issue's "labels_" value
    type_: str  # "issue" or "comment"
19
+
20
+
21
def date_to_int(dt_str: str) -> int:
    """Convert a date/time string to whole seconds since the Unix epoch.

    The string is parsed with ``dateutil.parser.parse`` and the resulting
    timestamp is truncated to an integer.

    NOTE: ``datetime.timestamp()`` interprets a timezone-less value as local
    time, so strings without an explicit offset give machine-dependent
    results.
    """
    timestamp = parse(dt_str).timestamp()
    return int(timestamp)
24
+
25
+
26
def get_contents(repo_name: str, filename: str) -> Iterator[tuple[Issue, str]]:
    """Yield ``(Issue, body)`` pairs for each issue and comment in a file.

    ``filename`` is read as JSON Lines: one JSON object per line, each
    describing an issue with the keys ``number``, ``title``, ``created_at``,
    ``user.login``, ``html_url``, ``labels_``, ``body`` and ``comments_``.
    Every entry of ``comments_`` must provide ``id``, ``created_at``,
    ``user.login``, ``html_url`` and ``body``.

    For each line the issue itself is yielded first (``type_="issue"``),
    followed by its comments in file order (``type_="comment"``). Comments
    inherit the title and labels of their parent issue.

    The file is streamed line by line, so records are produced lazily and
    the file stays open until the generator is exhausted or closed.

    Args:
        repo_name: Repository name recorded on every yielded ``Issue``.
        filename: Path to the JSON Lines dump.

    Raises:
        KeyError: If a record lacks one of the required keys.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default
    # locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            title = data["title"]
            labels = data["labels_"]
            issue = Issue(
                repo_name=repo_name,
                id=data["number"],
                title=title,
                created_at=date_to_int(data["created_at"]),
                user=data["user.login"],
                url=data["html_url"],
                labels=labels,
                type_="issue",
            )
            # A JSON null body would break the declared ``str`` contract
            # (presumably possible for issues with no description), so
            # normalise it to an empty string.
            yield issue, data["body"] or ""
            for comment in data["comments_"]:
                issue = Issue(
                    repo_name=repo_name,
                    id=comment["id"],
                    title=title,
                    created_at=date_to_int(comment["created_at"]),
                    user=comment["user.login"],
                    url=comment["html_url"],
                    labels=labels,
                    type_="comment",
                )
                yield issue, comment["body"] or ""
54
+
55
+
56
class GHLoader(BaseLoader):
    """Document loader for a GitHub issue dump read via ``get_contents``.

    Each issue and each comment becomes one ``Document`` whose
    ``page_content`` is the body text and whose ``metadata`` is the
    ``Issue`` dataclass converted to a dict.
    """

    def __init__(self, repo_name: str, filename: str):
        """Remember the repository name and the path of the dump file."""
        self.repo_name, self.filename = repo_name, filename

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per issue or comment, in file order."""
        pairs = get_contents(self.repo_name, self.filename)
        for issue, body in pairs:
            yield Document(page_content=body, metadata=asdict(issue))

    def load(self) -> list[Document]:
        """Return every document from ``lazy_load`` as a list."""
        return [doc for doc in self.lazy_load()]
requirments.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ tiktoken
3
+ qdrant-client
4
+ torch
5
+ transformers
6
+ accelerate
7
+ bitsandbytes
8
+ sentence_transformers
9
+ streamlit
10
+ python-dateutil
store.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+ from langchain.embeddings import HuggingFaceEmbeddings
3
+ from langchain.vectorstores import Qdrant
4
+
5
+ from gh_issue_loader import GHLoader
6
+ from config import DB_CONFIG
7
+
8
+
9
+ CHUNK_SIZE = 500  # chunk_size handed to RecursiveCharacterTextSplitter in get_text_chunk
10
+
11
+
12
def get_text_chunk(docs):
    """Split documents into chunks of at most ``CHUNK_SIZE`` with no overlap.

    Args:
        docs: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunked documents produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=0)
    return splitter.split_documents(docs)
18
+
19
+
20
def store(texts):
    """Embed the given document chunks and write them to Qdrant.

    Embeddings are computed with the ``intfloat/multilingual-e5-large``
    model (not normalised). The model runs on the GPU when CUDA is
    available and falls back to the CPU otherwise, so the script also works
    on machines without a GPU. Connection settings come from ``DB_CONFIG``.

    Args:
        texts: Document chunks to embed and store.
    """
    # Imported locally so this module gains no new top-level dependency.
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-large",
        model_kwargs={"device": device},
        encode_kwargs={"normalize_embeddings": False},
    )
    db_url, db_api_key, db_collection_name = DB_CONFIG
    # The returned vector store object is not needed; only the upload matters.
    Qdrant.from_documents(
        texts,
        embeddings,
        url=db_url,
        api_key=db_api_key,
        collection_name=db_collection_name,
    )
37
+
38
+
39
def main(repo_name: str, path: str) -> None:
    """Load the issue dump at ``path``, chunk it, and store the chunks.

    Args:
        repo_name: Repository name attached to every loaded document.
        path: Path of the issue dump file handed to ``GHLoader``.
    """
    documents = GHLoader(repo_name, path).load()
    store(get_text_chunk(documents))
44
+
45
+
46
if __name__ == "__main__":
    # Usage:
    #   $ python store.py "REPO_NAME" "FILE_PATH"
    #   $ python store.py cocoa data/cocoa-issues.json
    import sys

    if len(sys.argv) != 3:
        # Covers both too few and too many arguments. sys.exit() with a
        # string prints it to stderr and exits with status 1, so shell
        # callers can detect the failure.
        sys.exit("usage: python store.py REPO_NAME JSON_FILE_PATH")
    main(sys.argv[1], sys.argv[2])