LOUIS SANNA committed on
Commit
d6936f0
·
1 Parent(s): 780c913

feat(data): add analec

Browse files
README.md CHANGED
@@ -18,5 +18,5 @@ We abstracted the code so it's easy to build another tool based on another domai
18
  ## Build vector index
19
 
20
  ```bash
21
- python -m climateqa.build_index
22
  ```
 
18
  ## Build vector index
19
 
20
  ```bash
21
+ python -m anyqa.build_index
22
  ```
anyqa/build_index.py CHANGED
@@ -10,14 +10,18 @@ from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
10
 
11
 
12
  def load_data():
 
13
  docs = parse_data()
 
14
  embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
 
15
  vectorstore = get_vectorstore(embedding_function)
16
 
17
  assert isinstance(vectorstore, Chroma)
18
  vectorstore.from_documents(
19
  docs, embedding_function, persist_directory=PERSIST_DIRECTORY
20
  )
 
21
  return vectorstore
22
 
23
 
@@ -47,7 +51,7 @@ def parse_data():
47
 
48
 
49
  def parse_name(source: str) -> str:
50
- return source.split("/")[-1].split(".")[0]
51
 
52
 
53
  def parse_domain(source: str) -> str:
 
10
 
11
 
12
  def load_data():
13
+ print("Loading data...")
14
  docs = parse_data()
15
+ print("Loaded documents")
16
  embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
17
+ print("Building index...")
18
  vectorstore = get_vectorstore(embedding_function)
19
 
20
  assert isinstance(vectorstore, Chroma)
21
  vectorstore.from_documents(
22
  docs, embedding_function, persist_directory=PERSIST_DIRECTORY
23
  )
24
+ print("Index built")
25
  return vectorstore
26
 
27
 
 
51
 
52
 
53
  def parse_name(source: str) -> str:
54
+ return source.split("/")[-1].split(".")[0].replace("_", " ")
55
 
56
 
57
  def parse_domain(source: str) -> str:
chroma_db/{13934663-2db5-404d-be0f-51734d442e08 → 1730b83a-f75a-41e2-aba7-637881bb5ea8}/data_level0.bin RENAMED
File without changes
chroma_db/{13934663-2db5-404d-be0f-51734d442e08 → 1730b83a-f75a-41e2-aba7-637881bb5ea8}/header.bin RENAMED
File without changes
chroma_db/{13934663-2db5-404d-be0f-51734d442e08 → 1730b83a-f75a-41e2-aba7-637881bb5ea8}/length.bin RENAMED
File without changes
chroma_db/{13934663-2db5-404d-be0f-51734d442e08 → 1730b83a-f75a-41e2-aba7-637881bb5ea8}/link_lists.bin RENAMED
File without changes
chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/data_level0.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
3
- size 3212000
 
 
 
 
chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/header.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
3
- size 100
 
 
 
 
chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/length.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
3
- size 4000
 
 
 
 
chroma_db/5fa47764-2449-49fb-ae2f-0fd1886dfa2d/link_lists.bin DELETED
File without changes
chroma_db/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dc2c64a9de7507097ab452fdce23fc6348f38e0d34484d791a8c43366b78001
3
- size 2564096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d627997dd35604ac27e67f35911999f234285c39362fffecddd50621d9f01d77
3
+ size 4067328
constitution.pdf DELETED
Binary file (414 kB)
 
data/Confucianism/Analects of Confucius.pdf ADDED
Binary file (711 kB). View file
 
data/{daoism/tao-te-ching.pdf → Daoism/Tao_Te_Ching.pdf} RENAMED
File without changes
data/us-founding/constitution.pdf DELETED
Binary file (414 kB)
 
data/us-founding/declaration-of-independance.pdf DELETED
Binary file (742 kB)
 
declaration-of-independance.pdf DELETED
Binary file (742 kB)