from flask import Flask, render_template, request, jsonify, redirect, url_for, flash, session
from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, current_user
from flask_wtf.csrf import CSRFProtect
from flask_wtf import FlaskForm
from wtforms import StringField, PasswordField, SubmitField
from wtforms.validators import DataRequired
from werkzeug.security import generate_password_hash, check_password_hash
import arxiv
import requests
import PyPDF2
from io import BytesIO
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.memory import ConversationBufferMemory
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from functools import lru_cache
import time
import os
from dotenv import load_dotenv
import json
from datetime import datetime
import firebase_admin
from firebase_admin import credentials, auth

# Load environment variables
load_dotenv()

app = Flask(__name__)
app.secret_key = os.getenv('FLASK_SECRET_KEY')

# Initialize CSRF protection
csrf = CSRFProtect()
csrf.init_app(app)

# Initialize Flask-Login
login_manager = LoginManager()
login_manager.init_app(app)
login_manager.login_view = 'login'

# Initialize Groq
groq_api_key = os.getenv('GROQ_API_KEY')
llm = ChatGroq(
    temperature=0.3,
    groq_api_key=groq_api_key,
    model_name="qwen-2.5-32b"
)

# Initialize embeddings with proper cache directory
embeddings_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    cache_folder="/code/.cache/huggingface"
)

# Constants
MAX_CHUNKS = 50
MAX_RESPONSE_LENGTH = 6000
CACHE_DURATION = 3600  # 1 hour in seconds


# Form Classes
class LoginForm(FlaskForm):
    username = StringField('Username', validators=[DataRequired()])
    password = PasswordField('Password', validators=[DataRequired()])
    submit = SubmitField('Login')


class RegisterForm(FlaskForm):
    username = StringField('Username', validators=[DataRequired()])
    password = PasswordField('Password', validators=[DataRequired()])
    submit = SubmitField('Register')


# User class for Flask-Login
class User(UserMixin):
    def __init__(self, user_id, email):
        self.id = user_id
        self.email = email


def generate_analysis(chunks):
    analysis_prompts = {
        'executive_summary': """
## 🧠 Role
You are an AI assistant that explains research papers in a way that makes reading the original paper unnecessary. Your explanations should be **clear, engaging, and easy to understand**, even for someone who is not deeply familiar with the subject.

## 🎯 Goal
Given any research paper, provide a **simple breakdown** covering:

### 1️⃣ What problem does this paper solve?
- Explain the **issue the paper addresses**.
- Why is this problem **important**?
- What **challenges** existed before this research?

### 2️⃣ How does it solve the problem?
- Summarize the **key idea, method, or approach** used in the paper.
- If applicable, break it down into **steps or components**.
- Compare it to **previous solutions** and highlight what makes it better.

### 3️⃣ Why does this matter? (Real-world impact & applications)
- How can this research be **used in practice**?
- What **industries or fields** benefit from it?
- Does it improve **efficiency, accuracy, cost, or scalability**?

### 4️⃣ Explain with a simple analogy (if applicable)
- Use a **real-life example** to explain complex ideas.
- Keep it **relatable** (e.g., compare it to something like cooking, traveling, or streaming music).
### 5️⃣ Key findings & results
- Summarize the **main results** in simple terms.
- If possible, include **numbers, graphs, or comparisons** for clarity.

### 6️⃣ Limitations & Future Work
- Mention any **weaknesses** or areas for improvement.
- What are the **next steps** for research in this area?

### 7️⃣ Final Takeaway (One-liner summary)
- Provide a **quick summary** of the research in a **single sentence**.

---

## 🎨 Tone & Style
✔ **Simple & clear language** – Avoid jargon unless necessary.
✔ **Step-by-step explanations** – Organize information logically.
✔ **Engaging & structured** – Use bullet points, lists, or tables when needed.
✔ **Make it feel like a story** – Guide the reader smoothly from problem to solution.

---

## ⚡ How to Use This Prompt
1️⃣ Enter the **title, abstract, or full text** of any research paper.
2️⃣ AI will generate a **detailed explanation** that makes the paper easy to understand.
3️⃣ Use it for **blog posts, study guides, or an AI-powered research assistant**.

Remember: The output should be properly formatted in markdown while providing comprehensive coverage of the paper's content."""
    }

    analysis_results = {}

    for aspect, prompt in analysis_prompts.items():
        try:
            # Clean and join the chunks
            context = "\n\n".join(
                chunk.encode('ascii', 'ignore').decode('ascii')
                for chunk in chunks[:3]
            )

            response = llm.invoke(
                f"""Based on the following context from a research paper, {prompt}

Context: {context}

Additional Instructions:
- Provide specific examples and evidence from the text
- Use clear, academic language
- Maintain objectivity
- Include relevant quotes or data points
- Structure your response logically
- Use markdown formatting for clarity

Please provide a clear and specific response.""",
                temperature=0.3
            )

            analysis_results[aspect] = response.content[:MAX_RESPONSE_LENGTH]
        except Exception as e:
            analysis_results[aspect] = f"Analysis failed: {str(e)}"

    return analysis_results


def process_pdf(pdf_url):
    try:
        print(f"Starting PDF processing for: {pdf_url}")
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()

        pdf_file = BytesIO(response.content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Clean and normalize the text
        text = " ".join(
            page.extract_text().encode('ascii', 'ignore').decode('ascii')
            for page in pdf_reader.pages
        )

        if not text.strip():
            return {'error': 'No text could be extracted from the PDF'}

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        chunks = text_splitter.split_text(text)[:MAX_CHUNKS]

        analysis = generate_analysis(chunks)

        return {
            'success': True,
            'analysis': analysis
        }
    except Exception as e:
        return {'error': f"PDF processing failed: {str(e)}"}


@login_manager.user_loader
def load_user(user_id):
    if 'user_data' in session:
        user_data = session['user_data']
        return User(user_data['uid'], user_data['email'])
    return None


# User management functions
def load_users():
    try:
        with open('users.json', 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


def save_users(users):
    with open('users.json', 'w') as f:
        json.dump(users, f)


# Routes
@app.route('/')
@login_required
def index():
    return render_template('index.html')


@app.route('/login', methods=['GET'])
def login():
    if current_user.is_authenticated:
        return redirect(url_for('index'))
    return render_template('login.html')


@app.route('/register', methods=['GET'])
def register():
    if current_user.is_authenticated:
        print("User is already authenticated")
        return redirect(url_for('index'))
    return render_template('register.html')
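
# The frontend is expected to POST the signed-in user's uid and email to the
# route below; it stores them in the session and logs the user in via Flask-Login.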
@app.route('/verify-token', methods=['POST'])
def verify_token():
    try:
        data = request.json
        if not data or not data.get('uid') or not data.get('email'):
            return jsonify({'error': 'Missing required data'}), 400

        # Store user data in session
        session['user_data'] = {
            'uid': data['uid'],
            'email': data['email']
        }

        # Create and login user
        user = User(data['uid'], data['email'])
        login_user(user)

        return jsonify({'success': True, 'redirect': url_for('index')})
    except Exception as e:
        print(f"Verification error: {str(e)}")  # Add logging
        return jsonify({'error': str(e)}), 500


@app.route('/logout')
@login_required
def logout():
    logout_user()
    session.clear()
    return redirect(url_for('login'))


@app.route('/search', methods=['POST'])
@login_required
def search():
    try:
        data = request.get_json()
        paper_name = data.get('paper_name')
        sort_by = data.get('sort_by', 'relevance')
        max_results = data.get('max_results', 10)

        if not paper_name:
            return jsonify({'error': 'No search query provided'}), 400

        # Map sort_by to arxiv.SortCriterion
        sort_mapping = {
            'relevance': arxiv.SortCriterion.Relevance,
            'lastUpdated': arxiv.SortCriterion.LastUpdatedDate,
            'submitted': arxiv.SortCriterion.SubmittedDate
        }
        sort_criterion = sort_mapping.get(sort_by, arxiv.SortCriterion.Relevance)

        # Perform the search
        search = arxiv.Search(
            query=paper_name,
            max_results=max_results,
            sort_by=sort_criterion
        )

        results = []
        for paper in search.results():
            results.append({
                'title': paper.title,
                'authors': ', '.join(author.name for author in paper.authors),
                'abstract': paper.summary,
                'pdf_link': paper.pdf_url,
                'arxiv_link': paper.entry_id,
                'published': paper.published.strftime('%Y-%m-%d'),
                'category': paper.primary_category,
                'comment': paper.comment if hasattr(paper, 'comment') else None,
                'doi': paper.doi if hasattr(paper, 'doi') else None
            })

        return jsonify(results)
    except Exception as e:
        print(f"Search error: {str(e)}")
        return jsonify({'error': f'Failed to search papers: {str(e)}'}), 500


@app.route('/perform-rag', methods=['POST'])
@login_required
def perform_rag():
    try:
        pdf_url = request.json.get('pdf_url')
        if not pdf_url:
            return jsonify({'error': 'PDF URL is required'}), 400

        result = process_pdf(pdf_url)
        if 'error' in result:
            return jsonify({'error': result['error']}), 500

        return jsonify(result)
    except Exception as e:
        return jsonify({'error': str(e)}), 500


@app.route('/chat-with-paper', methods=['POST'])
@login_required
def chat_with_paper():
    try:
        pdf_url = request.json.get('pdf_url')
        question = request.json.get('question')

        if not pdf_url or not question:
            return jsonify({'error': 'PDF URL and question are required'}), 400

        # Get PDF text and create chunks
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()

        pdf_file = BytesIO(response.content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = " ".join(page.extract_text() for page in pdf_reader.pages)

        if not text.strip():
            return jsonify({'error': 'No text could be extracted from the PDF'})

        # Create text chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            chunk_overlap=200,
            length_function=len
        )
        chunks = text_splitter.split_text(text)[:MAX_CHUNKS]

        # Generate embeddings for chunks
        chunk_embeddings = embeddings_model.embed_documents(chunks)

        # Generate embedding for the question
        question_embedding = embeddings_model.embed_query(question)

        # Find most relevant chunks using cosine similarity
        similarities = []
        for chunk_embedding in chunk_embeddings:
            similarity = np.dot(question_embedding, chunk_embedding) / (
                np.linalg.norm(question_embedding) * np.linalg.norm(chunk_embedding)
            )
            similarities.append(similarity)
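        # similarities now holds one cosine score per chunk (dot product divided
        # by the product of the vector norms); values closer to 1 indicate chunks
        # that are semantically closer to the question.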
        # Get top 3 most relevant chunks
        top_chunk_indices = np.argsort(similarities)[-3:][::-1]
        relevant_chunks = [chunks[i] for i in top_chunk_indices]

        # Construct prompt with relevant context
        context = "\n\n".join(relevant_chunks)
        prompt = f"""Based on the following relevant excerpts from the research paper, please answer this question: {question}

Context from paper:
{context}

Please provide a clear, specific, and accurate response based solely on the information provided in these excerpts. If the answer cannot be fully determined from the given context, please indicate this in your response."""

        # Generate response using Groq
        response = llm.invoke(prompt)

        # Format and return response
        formatted_response = response.content.strip()

        # Add source citations
        source_info = "\n\nThis response is based on specific sections from the paper."

        return jsonify({
            'response': formatted_response + source_info,
            'relevance_scores': [float(similarities[i]) for i in top_chunk_indices]
        })
    except Exception as e:
        print(f"Chat error: {str(e)}")
        return jsonify({'error': f'Failed to process request: {str(e)}'}), 500


@app.route('/api/data', methods=['GET'])
def get_data():
    try:
        # Example: Get documents from a collection
        docs = load_users()
        data = [{doc_id: doc_data} for doc_id, doc_data in docs.items()]
        return jsonify(data), 200
    except Exception as e:
        return jsonify({"error": str(e)}), 500


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
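
# --- Illustrative usage sketch (not part of the application) ---
# A minimal example of exercising the unauthenticated /api/data endpoint,
# assuming the server is running locally on the port configured above; the
# host and the use of the `requests` library here are illustrative only.
#
#     import requests
#     resp = requests.get('http://localhost:7860/api/data', timeout=10)
#     print(resp.status_code, resp.json())
#
# The POST endpoints (/verify-token, /search, /perform-rag, /chat-with-paper)
# additionally need a logged-in session cookie and a CSRF token, since
# CSRFProtect is enabled on the app.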