shimizukawa committed
Commit 8d5b271
Parent(s): c1dc2ee

refactoring: move index annotation

Files changed:
- loaders/github_issue.py +3 -6
- loaders/rtdhtmlpage.py +1 -3
- loaders/wikipage.py +3 -5
- models.py +1 -1
- store.py +8 -3
loaders/github_issue.py
CHANGED

@@ -15,14 +15,13 @@ def date_to_int(dt_str: str) -> int:
     return int(dt.timestamp())


-def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
+def get_contents(inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
     with inputfile.open("r") as f:
         obj = [json.loads(line) for line in f]
     for data in obj:
         title = data["title"]
         body = data["body"]
         issue = GithubIssue(
-            index=index,
             id=data["number"],
             title=title,
             ctime=date_to_int(data["created_at"]),
@@ -37,7 +36,6 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:
         comments = data["comments_"]
         for comment in comments:
             issue = GithubIssue(
-                index=index,
                 id=comment["id"],
                 title=data["title"],
                 ctime=date_to_int(comment["created_at"]),
@@ -50,12 +48,11 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[GithubIssue, str]]:


 class GithubIssueLoader(BaseLoader):
-    def __init__(self, index: str, inputfile: Path):
-        self.index = index
+    def __init__(self, inputfile: Path):
         self.inputfile = inputfile

     def lazy_load(self) -> Iterator[Document]:
-        for issue, text in get_contents(self.index, self.inputfile):
+        for issue, text in get_contents(self.inputfile):
             metadata = asdict(issue)
             yield Document(page_content=text, metadata=metadata)

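With the index parameter removed, the loader can be exercised on its own and the index attached later in store.py. A minimal sketch, assuming the repository layout above and a hypothetical ndjson export path (the actual fields depend on the GitHub dump):

from pathlib import Path

from loaders.github_issue import GithubIssueLoader

# Hypothetical export path; the file is ndjson with the fields read above
# ("number", "title", "body", "created_at", "comments_", ...).
loader = GithubIssueLoader(inputfile=Path("data/github-issues.jsonl"))

for doc in loader.lazy_load():
    # metadata comes from asdict(GithubIssue); "index" now defaults to "" (see models.py).
    print(doc.metadata["id"], doc.metadata["title"])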
loaders/rtdhtmlpage.py
CHANGED

@@ -12,8 +12,7 @@ class RTDHtmlPageLoader(ReadTheDocsLoader):
     $ wget -r -np -A.html https://docs.djangoproject.com/en/4.2/
     $ python store.py -l rtdhtmlpage django ./docs.djangoproject.com/
     """
-    def __init__(self, index: str, inputfile: Path, *args, **kwargs):
-        self.index = index
+    def __init__(self, inputfile: Path, *args, **kwargs):
         kwargs["custom_html_tag"] = ("div", {"id": "docs-content"})
         super().__init__(inputfile, *args, **kwargs)

@@ -66,7 +65,6 @@ class RTDHtmlPageLoader(ReadTheDocsLoader):
             "user": "rtd",
             "type": "rtd",
             "url": f"https://{str(p)}",
-            "index": self.index,
             "id": str(p),
         }
         # print(metadata)
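After the change the loader needs only the input path. A rough sketch of constructing it directly, assuming a local mirror fetched with the wget command from the class docstring:

from pathlib import Path

from loaders.rtdhtmlpage import RTDHtmlPageLoader

# Directory produced by the wget command shown in the docstring above.
loader = RTDHtmlPageLoader(Path("./docs.djangoproject.com/"))

for doc in loader.lazy_load():
    # "index" is no longer set here; store.py annotates it afterwards.
    print(doc.metadata["url"])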
loaders/wikipage.py
CHANGED

@@ -15,7 +15,7 @@ def date_to_int(dt_str: str) -> int:
     return int(dt.timestamp())


-def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
+def get_contents(inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
     """filename for file with ndjson

     {"id": <page_id>, "title": <page title>, "content": <page body>, "ctime": ..., "user": <name>, "url": "https:..."}
@@ -28,7 +28,6 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:
         body = data["content"]
         ctime = date_to_int(data["ctime"]) if isinstance(data["ctime"], str) else data["ctime"]
         doc = WikiPage(
-            index=index,
             id=data["id"],
             title=title,
             ctime=ctime,
@@ -42,12 +41,11 @@ def get_contents(index: str, inputfile: Path) -> Iterator[tuple[WikiPage, str]]:


 class WikiPageLoader(BaseLoader):
-    def __init__(self, index: str, inputfile: Path):
-        self.index = index
+    def __init__(self, inputfile: Path):
         self.inputfile = inputfile

     def lazy_load(self) -> Iterator[Document]:
-        for doc, text in get_contents(self.index, self.inputfile):
+        for doc, text in get_contents(self.inputfile):
             metadata = asdict(doc)
             yield Document(page_content=text, metadata=metadata)

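The docstring pins down the expected ndjson record shape. A small self-contained sketch (made-up record values, hypothetical file name) of feeding such a file to the refactored loader:

import json
from pathlib import Path

from loaders.wikipage import WikiPageLoader

# One record in the shape documented in get_contents(); values are made up.
# ctime may also be an ISO date string, per the isinstance() check above.
record = {
    "id": 1,
    "title": "Example page",
    "content": "Body text of the wiki page.",
    "ctime": 1704067200,
    "user": "alice",
    "url": "https://wiki.example.com/1",
}

path = Path("wiki-sample.jsonl")
path.write_text(json.dumps(record) + "\n")

for doc in WikiPageLoader(inputfile=path).lazy_load():
    print(doc.metadata["title"], doc.metadata["ctime"])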
models.py
CHANGED

@@ -3,13 +3,13 @@ import dataclasses

 @dataclasses.dataclass(frozen=True)
 class BaseModel:
-    index: str
     id: int
     title: str
     ctime: int
     user: str
     url: str
     type: str
+    index: str = ""


 @dataclasses.dataclass(frozen=True)
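index also moves to the end of the field list because dataclass fields with defaults must follow the ones without; with the "" default, the frozen dataclass constructs without an index and the key still survives asdict(), so store.py can fill it in later. A quick illustration with made-up values:

import dataclasses

from models import BaseModel

# index is omitted on purpose; it falls back to the new "" default.
page = BaseModel(
    id=1,
    title="Example",
    ctime=1704067200,
    user="alice",
    url="https://example.com/1",
    type="wiki",
)
assert page.index == ""

# asdict() still exposes the key, so downstream code can overwrite it
# (this is what index_annotated_docs in store.py does on doc.metadata).
metadata = dataclasses.asdict(page)
metadata["index"] = "myproject"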
store.py
CHANGED

@@ -61,6 +61,12 @@ def get_parser():
     return p


+def index_annotated_docs(docs, index):
+    for doc in docs:
+        doc.metadata["index"] = index
+        yield doc
+
+
 def main():
     """
     $ python store.py --loader wikipage "index" "FILE_PATH"
@@ -71,12 +77,11 @@ def main():
     args = p.parse_args()
     loader = get_loader(
         args.loader,
-        index=args.index,
         inputfile=Path(args.inputfile),
     )

-    docs = loader.load()
-    texts = get_text_chunk(docs)
+    docs = loader.lazy_load()
+    texts = get_text_chunk(index_annotated_docs(docs, args.index))
     store(texts)

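index_annotated_docs is a plain generator over Documents, so the index is stamped on in one place, the lazy_load() streaming is preserved, and the loaders stay index-agnostic. A minimal standalone sketch with stand-in documents (the Document import path varies with the installed LangChain version; on older releases it lives under langchain.schema):

from langchain_core.documents import Document

from store import index_annotated_docs

# Stand-ins for what loader.lazy_load() would yield; "myproject" is an example index name.
docs = (
    Document(page_content="first", metadata={"id": 1}),
    Document(page_content="second", metadata={"id": 2}),
)

for doc in index_annotated_docs(docs, "myproject"):
    assert doc.metadata["index"] == "myproject"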