Spaces:

ashmib
/

green-city-finder

Running

App Files Community

Ashmi Banerjee committed on Sep 13

Commit

89cd5d5

•

1 Parent(s): ac20456

refactored the vectordb

Browse files

Files changed (9) hide show

app.py +1 -1
src/__init__.py +2 -2
src/information_retrieval/info_retrieval.py +11 -7
src/vectordb/create_db.py +3 -1
src/vectordb/helpers.py +15 -1
src/vectordb/ingest.py +61 -0
src/vectordb/{lancedb_init.py → schema.py} +0 -0
src/vectordb/search.py +97 -0
src/vectordb/vectordb.py +0 -190

app.py CHANGED Viewed

@@ -56,7 +56,7 @@ def create_ui():
             "        ")
         with gr.Group():
-            countries = gr.Dropdown(choices=list(df.country), multiselect=False, label="Countries")
             starting_point = gr.Dropdown(choices=[], multiselect=False,
                                          label="Select your starting point for the trip!")

             "        ")
         with gr.Group():
+            countries = gr.Dropdown(choices=list(df.country.unique()), multiselect=False, label="Country")
             starting_point = gr.Dropdown(choices=[], multiselect=False,
                                          label="Select your starting point for the trip!")

src/__init__.py CHANGED Viewed

@@ -1,6 +1,6 @@
-from src.vectordb.vectordb import *
 from src.vectordb.helpers import *
-from src.vectordb.lancedb_init import *
 from src.sustainability.s_fairness import *
 from src.information_retrieval.info_retrieval import *

+from src.vectordb.search import *
 from src.vectordb.helpers import *
+from src.vectordb.schema import *
 from src.sustainability.s_fairness import *
 from src.information_retrieval.info_retrieval import *

src/information_retrieval/info_retrieval.py CHANGED Viewed

@@ -2,8 +2,11 @@ import sys
 import re
 import os
 import json
 sys.path.append("../")
-from src.vectordb import vectordb
 from src.sustainability import s_fairness
 import logging
@@ -12,6 +15,7 @@ logging.basicConfig(encoding='utf-8', level=logging.DEBUG)
 from src.helpers.data_loaders import load_scores
 def get_travel_months(query):
     """
@@ -66,7 +70,7 @@ def get_wikivoyage_context(query, limit=10, reranking=0):
     # limit = params['limit']
     # reranking = params['reranking']
-    docs = vectordb.search_wikivoyage_docs(query, limit, reranking)
     logger.info("Finished getting chunked wikivoyage docs.")
     results = {}
@@ -76,7 +80,7 @@ def get_wikivoyage_context(query, limit=10, reranking=0):
     cities = [result['city'] for result in docs]
-    listings = vectordb.search_wikivoyage_listings(query, cities, limit, reranking)
     logger.info("Finished getting wikivoyage listings.")
     # logger.info(type(docs), type(listings))
@@ -92,7 +96,7 @@ def get_wikivoyage_context(query, limit=10, reranking=0):
     return results
-def get_sustainability_scores(starting_point: str , query: str, destinations: list):
     """
     Function to get the s-fairness scores for each destination for the given month (or the ideal month of travel if the user hasn't provided a month).
@@ -164,7 +168,7 @@ def get_cities(context: dict):
     """
     recommended_cities = []
     for city, info in context.items():
         city_info = {
             'city': city,
@@ -242,8 +246,8 @@ def test():
         # print(cities)
     except FileNotFoundError as e:
         try:
-            vectordb.create_wikivoyage_docs_db_and_add_data()
-            vectordb.create_wikivoyage_listings_db_and_add_data()
             try:
                 context = get_context(query, sustainability=1)

 import re
 import os
 import json
+from src.vectordb.ingest import create_wikivoyage_docs_db_and_add_data, create_wikivoyage_listings_db_and_add_data
 sys.path.append("../")
+from src.vectordb.search import search_wikivoyage_listings, search_wikivoyage_docs
 from src.sustainability import s_fairness
 import logging
 from src.helpers.data_loaders import load_scores
 def get_travel_months(query):
     """
     # limit = params['limit']
     # reranking = params['reranking']
+    docs = search_wikivoyage_docs(query, limit, reranking)
     logger.info("Finished getting chunked wikivoyage docs.")
     results = {}
     cities = [result['city'] for result in docs]
+    listings = search_wikivoyage_listings(query, cities, limit, reranking)
     logger.info("Finished getting wikivoyage listings.")
     # logger.info(type(docs), type(listings))
     return results
+def get_sustainability_scores(starting_point: str, query: str, destinations: list):
     """
     Function to get the s-fairness scores for each destination for the given month (or the ideal month of travel if the user hasn't provided a month).
     """
     recommended_cities = []
+    info = context[list(context.keys())[0]]
     for city, info in context.items():
         city_info = {
             'city': city,
         # print(cities)
     except FileNotFoundError as e:
         try:
+            create_wikivoyage_docs_db_and_add_data()
+            create_wikivoyage_listings_db_and_add_data()
             try:
                 context = get_context(query, sustainability=1)

src/vectordb/create_db.py CHANGED Viewed

@@ -1,9 +1,11 @@
-from src.vectordb.vectordb import *
 import logging
 logger = logging.getLogger(__name__)
 logging.basicConfig(encoding='utf-8', level=logging.DEBUG)
 def run():
     logging.info("Creating database for Wikivoyage Documents")

+from src.vectordb.search import *
 import logging
 logger = logging.getLogger(__name__)
 logging.basicConfig(encoding='utf-8', level=logging.DEBUG)
+from src.vectordb.ingest import create_wikivoyage_docs_db_and_add_data, create_wikivoyage_listings_db_and_add_data
 def run():
     logging.info("Creating database for Wikivoyage Documents")

src/vectordb/helpers.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import pandas as pd
 import os
 import re
@@ -7,7 +9,7 @@ import sys
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.dirname(SCRIPT_DIR))
-from data_directories import *
 def create_chunks(city, country, text):
@@ -148,3 +150,15 @@ def embed_query(query):
     # vector_dimension = model.get_sentence_embedding_dimension()
     embedding = model.encode(query).tolist()
     return embedding

+from typing import Optional
 import pandas as pd
 import os
 import re
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.dirname(SCRIPT_DIR))
+from src.data_directories import *
 def create_chunks(city, country, text):
     # vector_dimension = model.get_sentence_embedding_dimension()
     embedding = model.encode(query).tolist()
     return embedding
+def set_uri(run_local: Optional[bool] = False):
+    if run_local:
+        uri = database_dir
+        current_dir = os.path.split(os.getcwd())[1]
+        if "src" or "tests" in current_dir:  # hacky way to get the correct path
+            uri = uri.replace("../../", "../")
+    else:
+        uri = os.environ["BUCKET_NAME"]
+    return uri

src/vectordb/ingest.py ADDED Viewed

	@@ -0,0 +1,61 @@

+from typing import Optional, Callable
+import logging
+logger = logging.getLogger(__name__)
+logging.basicConfig(encoding='utf-8', level=logging.DEBUG)
+from src.vectordb.helpers import read_docs, read_listings, preprocess_df
+from src.vectordb.schema import WikivoyageDocuments, WikivoyageListings
+from src.vectordb.helpers import set_uri
+import lancedb
+def _create_table_and_ingest_data(table_name: str, schema: object, data_fetcher: Callable,
+                                  preprocessor: Optional[Callable] = None):
+    """
+    Generalized function to create a table and ingest data into the database.
+    Args:
+        - table_name: str, name of the table to create.
+        - schema: object, schema of the table.
+        - data_fetcher: Callable, function to fetch the data.
+        - preprocessor: Optional[Callable], function to preprocess the data (default is None).
+    """
+    uri = set_uri()
+    db = lancedb.connect(uri)
+    logger.info(f"Connected to DB. Reading data for table {table_name} now...")
+    df = data_fetcher()
+    if preprocessor:
+        df = preprocessor(df)
+    logger.info(f"Finished reading data for {table_name}, attempting to create table and ingest the data...")
+    db.drop_table(table_name, ignore_missing=True)
+    table = db.create_table(table_name, schema=schema)
+    table.add(df.to_dict('records'))
+    logger.info(f"Completed ingestion for {table_name}.")
+def create_wikivoyage_docs_db_and_add_data():
+    """
+    Creates the Wikivoyage documents table and ingests data.
+    """
+    _create_table_and_ingest_data(
+        table_name="wikivoyage_documents",
+        schema=WikivoyageDocuments,
+        data_fetcher=read_docs,
+        preprocessor=preprocess_df
+    )
+def create_wikivoyage_listings_db_and_add_data():
+    """
+    Creates the Wikivoyage listings table and ingests data.
+    """
+    _create_table_and_ingest_data(
+        table_name="wikivoyage_listings",
+        schema=WikivoyageListings,
+        data_fetcher=read_listings
+    )

src/vectordb/{lancedb_init.py → schema.py} RENAMED Viewed

File without changes

src/vectordb/search.py ADDED Viewed

	@@ -0,0 +1,97 @@

+# from src import *
+import logging
+import os
+import lancedb
+from lancedb.rerankers import ColbertReranker
+import sys
+logger = logging.getLogger(__name__)
+from typing import Optional
+from src.vectordb.helpers import set_uri
+# db = lancedb.connect("/tmp/db")
+def search(query: str, table_name: str, filter_condition: Optional[str] = None,
+           category: str = "docs", limit: int = 10, reranking: int = 0,
+           run_local: Optional[bool] = False) -> list | None:
+    """
+    Generalized function to search a database table, with optional filters and reranking.
+    Args:
+        - query: str, search query.
+        - table_name: str, name of the table to search.
+        - filter_condition: Optional[str], optional SQL-like condition for filtering results.
+        - category: str, type of category (default is 'docs').
+        - limit: int, number of results (default is 10).
+        - reranking: int (0 or 1), if activated, ColbertReranker is used.
+        - run_local: Optional[bool], whether to run in a local environment.
+    Returns:
+        A list of the most relevant documents or listings based on the category.
+    """
+    uri = set_uri(run_local)
+    try:
+        db = lancedb.connect(uri)
+    except Exception as e:
+        logger.error(f"Error while connecting to DB: {e}")
+        return None
+    logger.info(f"Connected to {table_name} DB.")
+    table = db.open_table(table_name)
+    search_query = table.search(query).metric('cosine')
+    if filter_condition:
+        search_query = search_query.where(filter_condition)
+    if reranking:
+        try:
+            column = 'description' if category == 'listings' else 'text'
+            reranker = ColbertReranker(column=column)
+            results = search_query.rerank(reranker=reranker).limit(limit).to_list()
+        except Exception as e:
+            exc_type, exc_obj, exc_tb = sys.exc_info()
+            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
+            logger.error(f"Error while reranking results: {e}, {(exc_type, fname, exc_tb.tb_lineno)}")
+            return None
+    else:
+        try:
+            results = search_query.limit(limit).to_list()
+        except Exception as e:
+            exc_type, exc_obj, exc_tb = sys.exc_info()
+            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
+            logger.error(f"Error while searching: {e}, {(exc_type, fname, exc_tb.tb_lineno)}")
+            return None
+    logger.info("Found the most relevant documents.")
+    if category == "docs":
+        return [{"city": r['city'], "country": r['country'], "section": r['section'], "text": r['text']} for r in
+                results]
+    else:
+        return [{"city": r['city'], "country": r['country'], "type": r['type'], "title": r['title'],
+                 "description": r['description']} for r in results]
+def search_wikivoyage_docs(query: str, limit: int = 10, reranking: int = 0,
+                           run_local: Optional[bool] = False) -> list | None:
+    """
+    Function to search documents in the Wikivoyage database.
+    """
+    return search(query=query, table_name="wikivoyage_documents", category="docs",
+                  limit=limit, reranking=reranking, run_local=run_local)
+def search_wikivoyage_listings(query: str, cities: list, limit: int = 10, reranking: int = 0,
+                               run_local: Optional[bool] = False) -> list | None:
+    """
+    Function to search listings in the Wikivoyage database, post-filtered by cities.
+    """
+    cities_filter = f"city IN {tuple(cities)}"
+    return search(query=query, table_name="wikivoyage_listings", filter_condition=cities_filter,
+                  category="listings", limit=limit, reranking=reranking, run_local=run_local)

src/vectordb/vectordb.py DELETED Viewed

@@ -1,190 +0,0 @@
-# from src import *
-from src.vectordb.helpers import *
-from src.vectordb.lancedb_init import *
-import logging
-import os
-import lancedb
-from lancedb.rerankers import ColbertReranker
-import sys
-logger = logging.getLogger(__name__)
-from typing import Optional
-# db = lancedb.connect("/tmp/db")
-def create_wikivoyage_docs_db_and_add_data():
-    """
-    Creates wikivoyage documents table and ingests data
-    """
-    uri = database_dir
-    current_dir = os.path.split(os.getcwd())[1]
-    if "src" or "tests" in current_dir: # hacky way to get the correct path
-        uri = uri.replace("../../", "../")
-    db = lancedb.connect(uri)
-    logger.info("Connected to DB. Reading data now...")
-    df = read_docs()
-    filtered_df = preprocess_df(df)
-    logger.info("Finished reading data, attempting to create table and ingest the data...")
-    db.drop_table("wikivoyage_documents", ignore_missing=True)
-    table = db.create_table("wikivoyage_documents", schema=WikivoyageDocuments)
-    table.add(filtered_df.to_dict('records'))
-    logger.info("Completed ingestion.")
-def create_wikivoyage_listings_db_and_add_data():
-    """
-    Creates wikivoyage listings table and ingests data
-    """
-    uri = database_dir
-    current_dir = os.path.split(os.getcwd())[1]
-    if "src" or "tests" in current_dir: # hacky way to get the correct path
-        uri = uri.replace("../../", "../")
-    db = lancedb.connect(uri)
-    logger.info("Connected to DB. Reading data now...")
-    df = read_listings()
-    logger.info("Finished reading data, attempting to create table and ingest the data...")
-    # filtered_df = preprocess_df(df)
-    db.drop_table("wikivoyage_listings", ignore_missing=True)
-    table = db.create_table("wikivoyage_listings", schema=WikivoyageListings)
-    table.add(df.astype('str').to_dict('records'))
-    logger.info("Completed ingestion.")
-def search_wikivoyage_docs(query: str, limit: int = 10, reranking: int = 0, run_local: Optional[bool] = False):
-    """
-    Function to search the wikivoyage database an return most relevant chunked docs.
-    Args:
-        - query: str
-        - limit: number of results (default is 10)
-        - reranking: bool (0 or 1), if activated, CrossEncoderReranker is used.
-    """
-    if run_local:
-        uri = database_dir
-        current_dir = os.path.split(os.getcwd())[1]
-        if "src" or "tests" in current_dir: # hacky way to get the correct path
-            uri = uri.replace("../../", "../")
-    else:
-        uri = os.environ["BUCKET_NAME"]
-    # print(uri)
-    try:
-        db = lancedb.connect(uri)
-    except Exception as e:
-        logger.error(f"Error while connecting to DB: {e}")
-    logger.info("Connected to Wikivoyage DB.")
-    print("Tablenames: ", db.table_names())
-    # query_embedding = embed_query(query)
-    table = db.open_table("wikivoyage_documents")
-    if reranking:
-        try:
-            reranker = ColbertReranker(column='text')
-            results = table.search(query) \
-                .metric('cosine') \
-                .rerank(reranker=reranker) \
-                .limit(limit) \
-                .to_list()
-        except Exception as e:
-            exc_type, exc_obj, exc_tb = sys.exc_info()
-            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
-            logger.error(f"Error while getting context: {e}, {(exc_type, fname, exc_tb.tb_lineno)}")
-    else:
-        try:
-            results = table.search(query) \
-                .limit(limit) \
-                .metric('cosine') \
-                .to_list()
-        except Exception as e:
-            exc_type, exc_obj, exc_tb = sys.exc_info()
-            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
-            logger.error(f"Error while getting context: {e}, {(exc_type, fname, exc_tb.tb_lineno)}")
-    logger.info("Found the most relevant documents.")
-    city_lists = [{"city": r['city'], "country": r['country'], "section": r['section'], "text": r['text']} for r in
-                  results]
-    # context = [f"city: {r['city']}, country: {r['country']}, name: {r['title']}, description: {r['description']}"
-    # for r in results]
-    return city_lists
-def search_wikivoyage_listings(query:str, cities: list, limit: int=10, reranking: int = 0, run_local: Optional[bool] = False):
-    """
-    Function to search the wikivoyage database an return most relevant listings, post-filtered by the recommended
-    cities provided by wikivoyage_documents table.
-    Args:
-        - query: str
-        - cities: list
-        - limit: number of results (default is 10)
-        - reranking: bool (0 or 1), if activated, CrossEncoderReranker is used.
-    """
-    if run_local:
-        uri = database_dir
-        current_dir = os.path.split(os.getcwd())[1]
-        if "src" or "tests" in current_dir: # hacky way to get the correct path
-            uri = uri.replace("../../", "../")
-    else:
-        uri = os.environ["BUCKET_NAME"]
-    db = lancedb.connect(uri)
-    logger.info("Connected to Wikivoyage Listings DB.")
-    table = db.open_table("wikivoyage_listings")
-    cities_filter = f"city IN {tuple(cities)}"
-    if reranking:
-        try:
-            reranker = ColbertReranker(column='description')
-            results = table.search(query) \
-                .where(cities_filter) \
-                .metric('cosine') \
-                .rerank(reranker=reranker) \
-                .limit(limit) \
-                .to_list()
-        except Exception as e:
-            exc_type, exc_obj, exc_tb = sys.exc_info()
-            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
-            logger.error(f"Error while getting context: {e}, {(exc_type, fname, exc_tb.tb_lineno)}")
-    else:
-        try:
-            results = table.search(query) \
-                .where(cities_filter) \
-                .metric('cosine') \
-                .limit(limit) \
-                .to_list()
-        except Exception as e:
-            exc_type, exc_obj, exc_tb = sys.exc_info()
-            fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
-            logger.error(f"Error while getting context: {e}, {(exc_type, fname, exc_tb.tb_lineno)}")
-    logger.info("Found the most relevant documents.")
-    city_listings = [{"city": r['city'], "country": r['country'], "type": r['type'], "title": r['title'],
-                      "description": r['description']} for r in results]
-    return city_listings