from flask import Flask, render_template, request, jsonify, redirect, url_for, flash, session
from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, current_user
from flask_wtf.csrf import CSRFProtect
from flask_wtf import FlaskForm
from wtforms import StringField, PasswordField, SubmitField
from wtforms.validators import DataRequired
from werkzeug.security import generate_password_hash, check_password_hash
import arxiv
import requests
import PyPDF2
from io import BytesIO
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from functools import lru_cache
import time
import os
from dotenv import load_dotenv
import json
from datetime import datetime
import firebase_admin
from firebase_admin import credentials, auth

# Load environment variables
load_dotenv()

app = Flask(__name__)
app.secret_key = os.getenv('FLASK_SECRET_KEY')

# Initialize CSRF protection
csrf = CSRFProtect()
csrf.init_app(app)
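# NOTE: CSRFProtect validates every POST request, including the JSON API
# endpoints below, so client-side fetch/AJAX calls are expected to send the
# token (rendered with {{ csrf_token() }}) in the X-CSRFToken header.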

# Initialize Flask-Login
login_manager = LoginManager()
login_manager.init_app(app)
login_manager.login_view = 'login'
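
# firebase_admin is imported above but never initialized, so any auth call
# would fail at runtime. A minimal guarded init sketch, assuming application
# default credentials (e.g. GOOGLE_APPLICATION_CREDENTIALS in the environment);
# _apps is the SDK's internal registry of initialized apps:
if not firebase_admin._apps:
    firebase_admin.initialize_app()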

# Initialize Groq
groq_api_key = os.getenv('GROQ_API_KEY')
llm = ChatGroq(
    temperature=0.3,
    groq_api_key=groq_api_key,
    model_name="qwen-2.5-32b"
)

# Initialize embeddings with proper cache directory
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="/code/.cache/huggingface"
)
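# NOTE: the embedding model is downloaded on first use, so the cache folder
# above must be writable by the runtime user; the /code path and the port 7860
# default below suggest a Hugging Face Spaces Docker container.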

# Constants
MAX_CHUNKS = 50
MAX_RESPONSE_LENGTH = 6000
CACHE_DURATION = 3600  # 1 hour in seconds

# Form Classes
class LoginForm(FlaskForm):
    username = StringField('Username', validators=[DataRequired()])
    password = PasswordField('Password', validators=[DataRequired()])
    submit = SubmitField('Login')


class RegisterForm(FlaskForm):
    username = StringField('Username', validators=[DataRequired()])
    password = PasswordField('Password', validators=[DataRequired()])
    submit = SubmitField('Register')


# User class for Flask-Login
class User(UserMixin):
    def __init__(self, user_id, email):
        self.id = user_id
        self.email = email

def generate_analysis(chunks):
    analysis_prompts = {
        'executive_summary': """
## 🧠 Role
You are an AI assistant that explains research papers in a way that makes reading the original paper unnecessary. Your explanations should be **clear, engaging, and easy to understand**, even for someone who is not deeply familiar with the subject.

## 🎯 Goal
Given any research paper, provide a **simple breakdown** covering:

### 1️⃣ What problem does this paper solve?
- Explain the **issue the paper addresses**.
- Why is this problem **important**?
- What **challenges** existed before this research?

### 2️⃣ How does it solve the problem?
- Summarize the **key idea, method, or approach** used in the paper.
- If applicable, break it down into **steps or components**.
- Compare it to **previous solutions** and highlight what makes it better.

### 3️⃣ Why does this matter? (Real-world impact & applications)
- How can this research be **used in practice**?
- What **industries or fields** benefit from it?
- Does it improve **efficiency, accuracy, cost, or scalability**?

### 4️⃣ Explain with a simple analogy (if applicable)
- Use a **real-life example** to explain complex ideas.
- Keep it **relatable** (e.g., compare it to something like cooking, traveling, or streaming music).

### 5️⃣ Key findings & results
- Summarize the **main results** in simple terms.
- If possible, include **numbers, graphs, or comparisons** for clarity.

### 6️⃣ Limitations & Future Work
- Mention any **weaknesses** or areas for improvement.
- What are the **next steps** for research in this area?

### 7️⃣ Final Takeaway (One-liner summary)
- Provide a **quick summary** of the research in a **single sentence**.

---
## 🎨 Tone & Style
✔ **Simple & clear language** – Avoid jargon unless necessary.
✔ **Step-by-step explanations** – Organize information logically.
✔ **Engaging & structured** – Use bullet points, lists, or tables when needed.
✔ **Make it feel like a story** – Guide the reader smoothly from problem to solution.

---
## ⚡ How to Use This Prompt
1️⃣ Enter the **title, abstract, or full text** of any research paper.
2️⃣ AI will generate a **detailed explanation** that makes the paper easy to understand.
3️⃣ Use it for **blog posts, study guides, or an AI-powered research assistant**.

Remember: The output should be properly formatted in markdown while providing comprehensive coverage of the paper's content."""
    }
    analysis_results = {}
    for aspect, prompt in analysis_prompts.items():
        try:
            # Clean and join the chunks (drop non-ASCII extraction artifacts)
            context = "\n\n".join(
                chunk.encode('ascii', 'ignore').decode('ascii')
                for chunk in chunks[:3]
            )
            # Temperature is already fixed at 0.3 on the ChatGroq instance above
            response = llm.invoke(
                f"""Based on the following context from a research paper, {prompt}

Context:
{context}

Additional Instructions:
- Provide specific examples and evidence from the text
- Use clear, academic language
- Maintain objectivity
- Include relevant quotes or data points
- Structure your response logically
- Use markdown formatting for clarity

Please provide a clear and specific response."""
            )
            analysis_results[aspect] = response.content[:MAX_RESPONSE_LENGTH]
        except Exception as e:
            analysis_results[aspect] = f"Analysis failed: {str(e)}"
    return analysis_results

def process_pdf(pdf_url):
    try:
        print(f"Starting PDF processing for: {pdf_url}")
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_file = BytesIO(response.content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Clean and normalize the text; extract_text() can return None for
        # image-only pages, so fall back to an empty string
        text = " ".join(
            (page.extract_text() or "").encode('ascii', 'ignore').decode('ascii')
            for page in pdf_reader.pages
        )
        if not text.strip():
            return {'error': 'No text could be extracted from the PDF'}

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        chunks = text_splitter.split_text(text)[:MAX_CHUNKS]
        analysis = generate_analysis(chunks)
        return {
            'success': True,
            'analysis': analysis
        }
    except Exception as e:
        return {'error': f"PDF processing failed: {str(e)}"}

@login_manager.user_loader
def load_user(user_id):
    if 'user_data' in session:
        user_data = session['user_data']
        return User(user_data['uid'], user_data['email'])
    return None

# User management functions (simple flat-file store)
def load_users():
    try:
        with open('users.json', 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


def save_users(users):
    with open('users.json', 'w') as f:
        json.dump(users, f)

# Routes (paths assumed from the endpoint names used in url_for() and
# login_manager.login_view)
@app.route('/')
def index():
    return render_template('index.html')


@app.route('/login')
def login():
    if current_user.is_authenticated:
        return redirect(url_for('index'))
    return render_template('login.html')


@app.route('/register')
def register():
    if current_user.is_authenticated:
        print("User is already authenticated")
        return redirect(url_for('index'))
    return render_template('register.html')

@app.route('/verify-token', methods=['POST'])
def verify_token():
    try:
        data = request.json
        if not data or not data.get('uid') or not data.get('email'):
            return jsonify({'error': 'Missing required data'}), 400
        # NOTE: this trusts the uid/email sent by the client; in production the
        # Firebase ID token should be checked server-side with
        # auth.verify_id_token() before creating the session.
        # Store user data in session
        session['user_data'] = {
            'uid': data['uid'],
            'email': data['email']
        }
        # Create and log in the user
        user = User(data['uid'], data['email'])
        login_user(user)
        return jsonify({'success': True, 'redirect': url_for('index')})
    except Exception as e:
        print(f"Verification error: {str(e)}")
        return jsonify({'error': str(e)}), 500

@app.route('/logout')
def logout():
    logout_user()
    session.clear()
    return redirect(url_for('login'))

@app.route('/search', methods=['POST'])
def search():
    try:
        data = request.get_json()
        paper_name = data.get('paper_name')
        sort_by = data.get('sort_by', 'relevance')
        max_results = data.get('max_results', 10)
        if not paper_name:
            return jsonify({'error': 'No search query provided'}), 400

        # Map sort_by to arxiv.SortCriterion
        sort_mapping = {
            'relevance': arxiv.SortCriterion.Relevance,
            'lastUpdated': arxiv.SortCriterion.LastUpdatedDate,
            'submitted': arxiv.SortCriterion.SubmittedDate
        }
        sort_criterion = sort_mapping.get(sort_by, arxiv.SortCriterion.Relevance)

        # Perform the search (renamed to avoid shadowing this view function)
        arxiv_search = arxiv.Search(
            query=paper_name,
            max_results=max_results,
            sort_by=sort_criterion
        )
        results = []
        # Search.results() is deprecated in arxiv 2.x in favor of
        # arxiv.Client().results(...), but still works
        for paper in arxiv_search.results():
            results.append({
                'title': paper.title,
                'authors': ', '.join(author.name for author in paper.authors),
                'abstract': paper.summary,
                'pdf_link': paper.pdf_url,
                'arxiv_link': paper.entry_id,
                'published': paper.published.strftime('%Y-%m-%d'),
                'category': paper.primary_category,
                'comment': paper.comment if hasattr(paper, 'comment') else None,
                'doi': paper.doi if hasattr(paper, 'doi') else None
            })
        return jsonify(results)
    except Exception as e:
        print(f"Search error: {str(e)}")
        return jsonify({'error': f'Failed to search papers: {str(e)}'}), 500

@app.route('/perform-rag', methods=['POST'])
def perform_rag():
    try:
        pdf_url = request.json.get('pdf_url')
        if not pdf_url:
            return jsonify({'error': 'PDF URL is required'}), 400
        result = process_pdf(pdf_url)
        if 'error' in result:
            return jsonify({'error': result['error']}), 500
        return jsonify(result)
    except Exception as e:
        return jsonify({'error': str(e)}), 500

@app.route('/chat-with-paper', methods=['POST'])
def chat_with_paper():
    try:
        pdf_url = request.json.get('pdf_url')
        question = request.json.get('question')
        if not pdf_url or not question:
            return jsonify({'error': 'PDF URL and question are required'}), 400

        # Get the PDF text; extract_text() can return None for image-only pages
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_file = BytesIO(response.content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = " ".join((page.extract_text() or "") for page in pdf_reader.pages)
        if not text.strip():
            return jsonify({'error': 'No text could be extracted from the PDF'}), 400

        # Create text chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)[:MAX_CHUNKS]

        # Generate embeddings for the chunks and the question
        chunk_embeddings = embeddings_model.embed_documents(chunks)
        question_embedding = embeddings_model.embed_query(question)

        # Find the most relevant chunks using cosine similarity
        similarities = []
        for chunk_embedding in chunk_embeddings:
            similarity = np.dot(question_embedding, chunk_embedding) / (
                np.linalg.norm(question_embedding) * np.linalg.norm(chunk_embedding)
            )
            similarities.append(similarity)

        # Get the top 3 most relevant chunks, highest similarity first
        top_chunk_indices = np.argsort(similarities)[-3:][::-1]
        relevant_chunks = [chunks[i] for i in top_chunk_indices]

        # Construct the prompt with the relevant context
        context = "\n\n".join(relevant_chunks)
        prompt = f"""Based on the following relevant excerpts from the research paper, please answer this question: {question}

Context from paper:
{context}

Please provide a clear, specific, and accurate response based solely on the information provided in these excerpts. If the answer cannot be fully determined from the given context, please indicate this in your response."""

        # Generate the response using Groq
        response = llm.invoke(prompt)
        formatted_response = response.content.strip()

        # Add source citations
        source_info = "\n\nThis response is based on specific sections from the paper."
        return jsonify({
            'response': formatted_response + source_info,
            'relevance_scores': [float(similarities[i]) for i in top_chunk_indices]
        })
    except Exception as e:
        print(f"Chat error: {str(e)}")
        return jsonify({'error': f'Failed to process request: {str(e)}'}), 500

@app.route('/get-data')
def get_data():
    try:
        # Example: return the documents from the flat-file user store
        docs = load_users()
        data = [{doc_id: doc_data} for doc_id, doc_data in docs.items()]
        return jsonify(data), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)