Upload 5 files
- Dockerfile +14 -0
- Final_Research_Dataset_2.csv +0 -0
- README.md +12 -12
- app.py +554 -0
- requirements.txt +12 -0
Dockerfile
ADDED
@@ -0,0 +1,14 @@
+# Use the official Python 3.12.6 image
+FROM python:3.12.6
+
+# Set the working directory to /
+WORKDIR /
+
+# Copy the current directory contents into the container at /
+COPY . .
+
+# Install the dependencies from requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /requirements.txt
+
+# Start the FastAPI app on port 7860, the default port expected by Spaces
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
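To try the image locally before pushing to Spaces, something like the following should work (a minimal sketch; the journal-finder tag is an arbitrary example name, and 7860 matches the port in the CMD line above):

docker build -t journal-finder .
docker run -p 7860:7860 journal-finder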
Final_Research_Dataset_2.csv
ADDED
The diff for this file is too large to render.
README.md
CHANGED
@@ -1,12 +1,12 @@
----
-title: Deploy.FastAPi.Application
-emoji: ⚡
-colorFrom: red
-colorTo: gray
-sdk: docker
-pinned: false
-license: apache-2.0
-short_description: Journal-Finder
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+---
+title: Deploy.FastAPi.Application
+emoji: ⚡
+colorFrom: red
+colorTo: gray
+sdk: docker
+pinned: false
+license: apache-2.0
+short_description: Journal-Finder
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,554 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+import os
+from typing import List, Optional
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import FAISS
+from langchain_community.document_loaders import CSVLoader
+from langchain_openai import ChatOpenAI
+from langchain_groq import ChatGroq
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.chains.combine_documents import create_stuff_documents_chain
+from langchain.chains import create_retrieval_chain
+from langchain_google_genai import ChatGoogleGenerativeAI
+from dotenv import load_dotenv
+from fastapi.responses import PlainTextResponse
+from fastapi.middleware.cors import CORSMiddleware
+import json
+import re
+
+# Load environment variables from a local .env file, if present
+load_dotenv()
+# Re-export the API keys, skipping any that are unset (assigning None would raise a TypeError)
+for _name in ("GOOGLE_API_KEY", "GROQ_API_KEY", "OPENAI_API_KEY"):
+    _value = os.getenv(_name)
+    if _value:
+        os.environ[_name] = _value
+key = os.getenv("GOOGLE_API_KEY")
+
+# Define paths
+DB_FAISS_PATH = "bgi/db_faiss"
+
+# Initialize FastAPI app
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Allow all origins; restrict to the React app's URL in production
+    allow_credentials=True,
+    allow_methods=["*"],  # Allow all HTTP methods
+    allow_headers=["*"],  # Allow all headers
+)
+# Initialize globals populated at startup
+embeddings = None
+db = None
+
+# Load or create the FAISS vector store
+@app.on_event("startup")
+def load_vector_store():
+    global embeddings, db
+    if os.path.exists(DB_FAISS_PATH):
+        print("Loading existing FAISS vector store.")
+        embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en', model_kwargs={'device': 'cpu'})
+        db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
+        print("Vector store loaded.")
+    else:
+        print("Creating new FAISS vector store.")
+        loader = CSVLoader(file_path="Final_Research_Dataset_2.csv", encoding="utf-8", csv_args={'delimiter': ','})
+        data = loader.load()
+        embeddings = HuggingFaceEmbeddings(model_name='BAAI/bge-small-en', model_kwargs={'device': 'cpu'})
+        db = FAISS.from_documents(data, embeddings)
+        db.save_local(DB_FAISS_PATH)
+
+
+# Define request and response models
+class FilterCriteria(BaseModel):
+    impactFactor: float
+    firstDecisionTime: int
+    publisher: Optional[List[str]] = None  # Preferred publishers, or ["no preference"]
+    llmModel: str
+
+class QueryRequest(BaseModel):
+    abstract: str
+    criteria: FilterCriteria
+
+class Journal(BaseModel):
+    id: int
+    Name: str
+    JIF: float
+    Category: str
+    Keywords: str
+    Publisher: str
+    Decision_Time: int
+
+# Define the QueryResponse model with a list of journals
+class QueryResponse(BaseModel):
+    result: List[Journal]
+
+
+@app.get("/", response_class=PlainTextResponse)
+def read_root():
+    return "Welcome to the Journal Recommender API!"
+
+# List the available model back-ends
+@app.get("/models")
+def get_models():
+    return {"available_models": ["openai", "groq", "mixtral", "gemini-pro", "faiss"]}
+
+def fix_incomplete_json(raw_response):
+    """
+    Fixes incomplete JSON by adding missing braces or brackets.
+    Returns the parsed JSON, or None if it is not fixable.
+    """
+    # Drop a trailing comma left after the last object in a list
+    if raw_response.endswith("},"):
+        raw_response = raw_response[:-1]
+    if raw_response.count("{") > raw_response.count("}"):
+        raw_response += "}"
+    if raw_response.count("[") > raw_response.count("]"):
+        raw_response += "]"
+
+    # Try to load the fixed response
+    try:
+        json_response = json.loads(raw_response)
+        return json_response
+    except json.JSONDecodeError as e:
+        print(f"Error fixing JSON: {e}")
+        return None
+
+
+# Query endpoint
+@app.post("/query", response_model=QueryResponse)
+async def query(request: QueryRequest):
+    global db
+    if not db:
+        raise HTTPException(status_code=500, detail="Vector store not loaded.")
+
+    query_text = request.abstract
+    model_choice = request.criteria.llmModel
+    impact_factor = request.criteria.impactFactor
+    preferred_publisher = request.criteria.publisher
+    # Perform the similarity search
+    docs = db.similarity_search(query_text, k=5)
+    context = "\n".join([doc.page_content for doc in docs])
+
+    messages = [
+        {
+            "role": "system",
+            "content": (
+                "Give a strict comma-separated list of exactly 15 keywords from the following text. "
+                "Do not include any bullet points, introductory text, or ending text. "
+                "No introductory or ending text, strictly. "  # Reinforcement; can be removed if results deteriorate
+                "Do not say anything like 'Here are the keywords.' "
+                "Only return the keywords, strictly comma-separated, without any additional words."
+            ),
+        },
+        {"role": "user", "content": query_text},
+    ]
+    llm = ChatGroq(model="llama3-8b-8192", temperature=0)
+    ai_msg = llm.invoke(messages)
+    # Strip any preamble the model may still prepend despite the instructions
+    keywords = ai_msg.content.split("keywords extracted from the text:\n")[-1].strip()
+    print("Keywords:", keywords)
+    if model_choice == "openai":
+        retriever = db.as_retriever()
+
+        # Set up system prompt
+        system_prompt = (
+            f"You are a specialized journal recommender that compares all journals in the database "
+            f"to the given research paper keywords and recommends journals based on JIF and publisher. "
+            f"From the provided context, recommend all journals that are suitable for a research paper with these keywords: {keywords}. "
+            f"Include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, "
+            f"and only journals from publishers in this list: {preferred_publisher}. Report the JIF exactly as it appears in the context. "
+            f"Include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
+            f"Present the results in JSON format with the following fields: Journal Name, Publisher, JIF, Decision Time. "
+            f"Do not include any introductory or closing text. Return at most 30 results. "
+            "Context: {context}"
+        )
+
+        prompt = ChatPromptTemplate.from_messages(
+            [("system", system_prompt), ("user", "{input}")]
+        )
+
+        async def create_chain():
+            client = ChatOpenAI(model="gpt-4o")
+            return create_stuff_documents_chain(client, prompt)
+
+        # Create the question-answer chain using the async helper
+        question_answer_chain = await create_chain()
+        rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+        # Invoke the RAG chain
+        answer = rag_chain.invoke(
+            {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor}, Publisher list: {preferred_publisher}"}
+        )
+
+        # Clean the raw model output (often wrapped in a ```json fence)
+        result = []
+        raw_response = answer['answer']
+        cleaned_response = raw_response.strip().removeprefix("```json").removesuffix("```").strip()
+
+        # Parse the cleaned JSON response
+        try:
+            json_response = json.loads(cleaned_response)
+
+            # Process the JSON data and create Journal objects
+            for i, journal in enumerate(json_response):
+                try:
+                    journal_name = journal.get('Journal Name')
+                    publisher = journal.get('Publisher')
+                    jif = float(journal.get('JIF', 0))  # Ensure valid float
+                    decision_time = journal.get('Decision Time', 0)  # Default to 0 if not available
+
+                    # Only include if JIF is greater than the minimum threshold
+                    if jif > impact_factor:
+                        result.append(
+                            Journal(
+                                id=i + 1,
+                                Name=journal_name,
+                                Publisher=publisher,
+                                JIF=jif,
+                                Category="",  # Set to empty if not available
+                                Keywords=keywords,  # Use the extracted keywords
+                                Decision_Time=decision_time,
+                            )
+                        )
+                except Exception as e:
+                    print(f"Error processing journal data: {e}")
+
+        except json.JSONDecodeError as e:
+            print(f"Error parsing JSON response: {e}")
+            result = []
+
+        # Return the result wrapped in a QueryResponse
+        return QueryResponse(result=result)
+    elif model_choice == "groq":
+        retriever = db.as_retriever()
+
+        # Set up system prompt
+        system_prompt = (
+            f"You are a specialized journal recommender that compares all journals in the database "
+            f"to the given research paper keywords and recommends journals based on JIF and publisher. "
+            f"From the provided context, recommend all journals that are suitable for a research paper with these keywords: {keywords}. "
+            f"Include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, "
+            f"and only journals from publishers in this list: {preferred_publisher}. Report the JIF exactly as it appears in the context. "
+            f"Include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
+            f"Present the results in JSON format with the following fields: Journal Name, Publisher, JIF, Decision Time. "
+            f"Do not include any introductory or closing text. Do not return more than 10 results. "
+            "Context: {context}"
+        )
+
+        prompt = ChatPromptTemplate.from_messages(
+            [("system", system_prompt), ("user", "{input}")]
+        )
+
+        # Create the question-answer chain
+        async def create_chain():
+            client = ChatGroq(model="llama-3.2-3b-preview", temperature=0)
+            return create_stuff_documents_chain(client, prompt)
+
+        # Create the question-answer chain using the async helper
+        question_answer_chain = await create_chain()
+        rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+        # Invoke the RAG chain
+        answer = rag_chain.invoke(
+            {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor}, Publisher list: {preferred_publisher}"}
+        )
+
+        # Clean the raw model output
+        result = []
+        raw_response = answer['answer']
+        cleaned_response = raw_response.strip().removeprefix("```json").removesuffix("```").strip()
+
+        # Parse the cleaned JSON response
+        try:
+            print("Cleaned Response:", cleaned_response)  # For debugging
+            json_response = json.loads(cleaned_response)
+
+            # Process the JSON data and create Journal objects
+            for i, journal in enumerate(json_response["journals"]):  # This branch expects a top-level 'journals' key
+                print("Journal entry:", journal)  # For debugging
+
+                try:
+                    if isinstance(journal, dict):  # Ensure journal is a dictionary
+                        journal_name = journal.get('Journal Name')
+                        publisher = journal.get('Publisher')
+                        jif = float(journal.get('JIF', 0))  # Ensure valid float
+                        decision_time = journal.get('Decision Time', 0)  # Default to 0 if not available
+
+                        # Only include if JIF is greater than the minimum threshold
+                        if jif > impact_factor:
+                            result.append(
+                                Journal(
+                                    id=i + 1,
+                                    Name=journal_name,
+                                    Publisher=publisher,
+                                    JIF=jif,
+                                    Category="",  # Set to empty if not available
+                                    Keywords=keywords,  # Use the extracted keywords
+                                    Decision_Time=decision_time,
+                                )
+                            )
+                    else:
+                        print(f"Skipping invalid journal entry: {journal}")
+                except Exception as e:
+                    print(f"Error processing journal data: {e}")
+
+        except json.JSONDecodeError as e:
+            print(f"Error parsing JSON response: {e}")
+            result = []
+
+        # Return the result wrapped in a QueryResponse
+        return QueryResponse(result=result)
+
+
+    elif model_choice == "mixtral":
+        retriever = db.as_retriever()
+
+        # Set up system prompt
+        system_prompt = (
+            f"You are a specialized journal recommender that compares all journals in the database "
+            f"to the given research paper keywords and recommends journals based on JIF and publisher. "
+            f"From the provided context, recommend all journals that are suitable for a research paper with these keywords: {keywords}. "
+            f"Include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, "
+            f"and only journals from publishers in this list: {preferred_publisher}. Report the JIF exactly as it appears in the context. "
+            f"Include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
+            f"Present the results in JSON format with the following fields: Journal Name, Publisher, JIF, Decision Time. "
+            f"Do not include any introductory or closing text. Do not return more than 10 results. "
+            "Context: {context}"
+        )
+
+        prompt = ChatPromptTemplate.from_messages(
+            [("system", system_prompt), ("user", "{input}")]
+        )
+
+        # Create the question-answer chain
+        async def create_chain():
+            client = ChatGroq(model="mixtral-8x7b-32768", temperature=0)
+            return create_stuff_documents_chain(client, prompt)
+
+        # Create the question-answer chain using the async helper
+        question_answer_chain = await create_chain()
+        rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+        # Invoke the RAG chain
+        answer = rag_chain.invoke(
+            {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor}, Publisher list: {preferred_publisher}"}
+        )
+
+        # Clean the raw model output
+        result = []
+        raw_response = answer['answer']
+        cleaned_response = raw_response.strip().removeprefix("```json").removesuffix("```").strip()
+
+        # Parse the cleaned JSON response
+        try:
+            print("Cleaned Response:", cleaned_response)  # For debugging
+            json_response = json.loads(cleaned_response)
+
+            # Process the JSON data and create Journal objects
+            for i, journal in enumerate(json_response):  # Iterate directly over the list
+                print("Journal entry:", journal)  # For debugging
+
+                try:
+                    if isinstance(journal, dict):  # Ensure journal is a dictionary
+                        journal_name = journal.get('Journal Name')
+                        publisher = journal.get('Publisher')
+                        jif = float(journal.get('JIF', 0))  # Ensure valid float
+                        decision_time = journal.get('Decision Time', 0)  # Default to 0 if not available
+
+                        # Only include if JIF is greater than the minimum threshold
+                        if jif > impact_factor:
+                            result.append(
+                                Journal(
+                                    id=i + 1,
+                                    Name=journal_name,
+                                    Publisher=publisher,
+                                    JIF=jif,
+                                    Category="",  # Set to empty if not available
+                                    Keywords=keywords,  # Use the extracted keywords
+                                    Decision_Time=decision_time,
+                                )
+                            )
+                    else:
+                        print(f"Skipping invalid journal entry: {journal}")
+                except Exception as e:
+                    print(f"Error processing journal data: {e}")
+
+        except json.JSONDecodeError as e:
+            print(f"Error parsing JSON response: {e}")
+            result = []
+
+        # Return the result wrapped in a QueryResponse
+        return QueryResponse(result=result)
+
+    elif model_choice == "gemini-pro":
+        print("Using Gemini-Pro model")
+        retriever = db.as_retriever()
+
+        # Set up system prompt
+        system_prompt = (
+            f"You are a specialized journal recommender that compares all journals in the database "
+            f"to the given research paper keywords and recommends journals based on JIF and publisher. "
+            f"From the provided context, recommend all journals that are suitable for a research paper with these keywords: {keywords}. "
+            f"Include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, "
+            f"and only journals from publishers in this list: {preferred_publisher}. Report the JIF exactly as it appears in the context. "
+            f"Include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
+            f"Present the results in JSON format with the following fields: Journal Name, Publisher, JIF, Decision Time. "
+            f"Do not include any introductory or closing text. "
+            "Context: {context}"
+        )
+
+        prompt = ChatPromptTemplate.from_messages(
+            [("system", system_prompt), ("user", "{input}")]
+        )
+
+        async def create_chain():
+            client = ChatGoogleGenerativeAI(
+                model="gemini-pro",
+                google_api_key=key,
+                convert_system_message_to_human=True,
+            )
+            return create_stuff_documents_chain(client, prompt)
+
+        # Create the question-answer chain using the async helper
+        question_answer_chain = await create_chain()
+        rag_chain = create_retrieval_chain(retriever, question_answer_chain)
+
+        # Invoke the RAG chain
+        answer = rag_chain.invoke(
+            {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor}, Publisher list: {preferred_publisher}"}
+        )
+
+        # Clean the raw model output
+        result = []
+        raw_response = answer['answer']
+        cleaned_response = raw_response.strip().removeprefix("```json").removesuffix("```").strip()
+
+        # Parse the cleaned JSON response
+        try:
+            json_response = json.loads(cleaned_response)
+
+            # Process the JSON data and create Journal objects
+            for i, journal in enumerate(json_response):
+                try:
+                    journal_name = journal.get('Journal Name')
+                    publisher = journal.get('Publisher')
+                    jif = float(journal.get('JIF', 0))  # Ensure valid float
+                    decision_time = journal.get('Decision Time', 0)  # Default to 0 if not available
+
+                    # Only include if JIF is greater than the minimum threshold
+                    if jif > impact_factor:
+                        result.append(
+                            Journal(
+                                id=i + 1,
+                                Name=journal_name,
+                                Publisher=publisher,
+                                JIF=jif,
+                                Category="",  # Set to empty if not available
+                                Keywords=keywords,  # Use the extracted keywords
+                                Decision_Time=decision_time,
+                            )
+                        )
+                except Exception as e:
+                    print(f"Error processing journal data: {e}")
+
+        except json.JSONDecodeError as e:
+            print(f"Error parsing JSON response: {e}")
+            result = []
+
+        # Return the result wrapped in a QueryResponse
+        return QueryResponse(result=result)
+    elif model_choice == "faiss":
+        embeddings = HuggingFaceEmbeddings(
+            model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"}
+        )
+        jif = impact_factor  # Minimum JIF value for filtering
+        publisher = preferred_publisher  # Preferred publisher list, or ["no preference"]
+
+        # Load the FAISS index from local storage
+        db1 = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
+
+        # Embed the query
+        query_embedding = embeddings.embed_query(keywords)
+
+        # Perform similarity search with FAISS (retrieve top 20 results)
+        results = db1.similarity_search_by_vector(query_embedding, k=20)
+
+        # Prepare the context for processing results
+        context = "\n\n".join(doc.page_content for doc in results)
+
+        # Publisher preference; currently computed but not applied in the filter below
+        min_jif = jif
+        valid_publishers = publisher if publisher != ["no preference"] else None
+
+        # Split the output based on each entry starting with 'Name: '
+        entries = re.split(r"\n(?=Name:)", context.strip())
+
+        # Initialize an empty list to hold the Journal models
+        journal_list = []
+
+        # Process each entry
+        for entry in entries:
+            # Use regex to capture the individual fields
+            name = re.search(r"Name: (.+)", entry)
+            jif_match = re.search(r"JIF: (.+)", entry)
+            category = re.search(r"Category: (.+)", entry)
+            keywords_match = re.search(r"Keywords: (.+)", entry)
+            publisher_match = re.search(r"Publisher: (.+)", entry)
+            # 'Decsion Time' matches the field spelling used in the CSV rows
+            first_decision_match = re.search(r"Decsion Time: (.+)", entry)
+
+            if jif_match:
+                # Extract values from regex matches
+                name_value = name.group(1).strip()
+                jif_value = float(jif_match.group(1).strip())
+                category_value = category.group(1).strip()
+                keywords_value = keywords_match.group(1).strip()
+                publisher_value = publisher_match.group(1).strip()
+                decision_time = first_decision_match.group(1).strip()
+                # Filter on the minimum JIF (publisher preference is not applied here)
+                if jif_value >= min_jif:
+                    # Create the Journal model instance
+                    journal = Journal(
+                        id=len(journal_list) + 1,  # Incrementing ID for each journal
+                        Name=name_value,
+                        JIF=jif_value,
+                        Category=category_value,
+                        Keywords=keywords_value,
+                        Publisher=publisher_value,
+                        Decision_Time=decision_time,
+                    )
+
+                    # Add the journal to the list
+                    journal_list.append(journal)
+
+        # Return the list of journals wrapped in a QueryResponse
+        return QueryResponse(result=journal_list)
+    else:
+        raise HTTPException(status_code=400, detail="Invalid model choice.")
+
+# Run the app with Uvicorn
+# Command: uvicorn app:app --reload
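Once the app is running, a request to the /query endpoint looks roughly like this (a hedged sketch: the abstract and criteria values are illustrative, the host assumes a local run on the port set in the Dockerfile, and the body shape follows the QueryRequest model above):

curl -X POST http://localhost:7860/query \
  -H "Content-Type: application/json" \
  -d '{"abstract": "A study of transformer models for biomedical text mining...",
       "criteria": {"impactFactor": 3.0, "firstDecisionTime": 30,
                    "publisher": ["no preference"], "llmModel": "faiss"}}'

The response is a QueryResponse object, i.e. {"result": [...]} with one entry per matching journal.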
requirements.txt
ADDED
@@ -0,0 +1,12 @@
+fastapi
+uvicorn
+pydantic
+python-dotenv
+langchain-community
+langchain-openai
+langchain-google-genai
+langchain-core
+langchain-groq
+faiss-cpu
+numpy
+sentence-transformers
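Inside the container these are installed by the RUN step in the Dockerfile; for a local run the equivalent is pip install -r requirements.txt. One caveat: app.py also imports from the langchain package itself (langchain.chains), which is not listed here and is not guaranteed to be pulled in transitively, so it may need to be installed explicitly:

pip install -r requirements.txt
pip install langchain  # not in requirements.txt; needed for the langchain.chains imports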