Spaces:

vishalsh13
/

VectorDBConversionfromfiles

Runtime error

App Files Files Community

vishalsh13 committed on Jan 11

Commit

e4dba65

1 Parent(s): ecf18d6

commit as its working on local

Browse files

Files changed (12) hide show

.gitignore +3 -0
app/__init__.py +12 -1
app/__pycache__/__init__.cpython-310.pyc +0 -0
app/__pycache__/routes.cpython-310.pyc +0 -0
app/routes.py +54 -20
app/templates/index.html +12 -43
app/utils/__pycache__/vector_db.cpython-310.pyc +0 -0
app/utils/__pycache__/zip_handler.cpython-310.pyc +0 -0
app/utils/file_handler.py +0 -4
app/utils/vector_db.py +15 -17
app/utils/zip_handler.py +5 -4
run.py +3 -2

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.venv/
+.venv
+__pycache__/

app/__init__.py CHANGED Viewed

@@ -1,8 +1,19 @@
 from flask import Flask
 def create_app():
     v_app = Flask(__name__)
-    v_app.config['UPLOAD_FOLDER'] = 'app/uploads/'
     v_app.config['ALLOWED_EXTENSIONS'] = {'zip'}
     with v_app.app_context():

+# app/__init__.py
+import os
 from flask import Flask
 def create_app():
     v_app = Flask(__name__)
+    # Get the absolute path of the "app" folder
+    v_base_dir = os.path.dirname(os.path.abspath(__file__))
+    # This should point to your "app" folder, for example:
+    # C:\Users\sharm\Documents\projects\VectorDBConversionfromfiles\app
+    # Now "uploads" will be: C:\Users\sharm\Documents\projects\VectorDBConversionfromfiles\app\uploads
+    v_uploads_folder = os.path.join(v_base_dir, 'uploads')
+    v_app.config['UPLOAD_FOLDER'] = v_uploads_folder
     v_app.config['ALLOWED_EXTENSIONS'] = {'zip'}
     with v_app.app_context():

app/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (564 Bytes)

app/__pycache__/routes.cpython-310.pyc DELETED Viewed

Binary file (1.1 kB)

app/routes.py CHANGED Viewed

@@ -1,38 +1,72 @@
 import os
-from flask import Blueprint, render_template, request, send_file, jsonify
-from .utils.vector_db import process_files_to_vectors
-from .utils.zip_handler import handle_zip_upload  # We'll create this utility
 import zipfile
 v_bp = Blueprint('routes', __name__)
 @v_bp.route('/', methods=['GET', 'POST'])
 def home():
     if request.method == 'POST':
         v_uploaded_file = request.files.get('file')
         if not v_uploaded_file or not v_uploaded_file.filename.endswith('.zip'):
-            return jsonify({'error': 'Please upload a valid zip file.'}), 400
-        # Save uploaded ZIP
-        v_upload_path = os.path.join('app/uploads', v_uploaded_file.filename)
         v_uploaded_file.save(v_upload_path)
-        # Extract the ZIP
-        v_extracted_folder = handle_zip_upload(v_upload_path)
-        # Process to create or update vector DB
-        v_result_folder = process_files_to_vectors(v_extracted_folder)
-        # Zip the resulting vectors folder for download
-        v_result_zip_path = os.path.join('app/uploads', 'vector_db_result.zip')
-        obj_zip = zipfile.ZipFile(v_result_zip_path, 'w', zipfile.ZIP_DEFLATED)
-        for v_root, _, v_files in os.walk(v_result_folder):
-            for v_file in v_files:
-                v_full_path = os.path.join(v_root, v_file)
-                v_arcname = os.path.relpath(v_full_path, start=v_result_folder)
-                obj_zip.write(v_full_path, arcname=v_arcname)
-        obj_zip.close()
-        return send_file(v_result_zip_path, as_attachment=True)
     return render_template('index.html')

 import os
 import zipfile
+import shutil
+import datetime
+from flask import Blueprint, render_template, request, send_file, jsonify, current_app
+from .utils.zip_handler import handle_zip_upload
+from .utils.vector_db import process_files_to_vectors
 v_bp = Blueprint('routes', __name__)
 @v_bp.route('/', methods=['GET', 'POST'])
 def home():
     if request.method == 'POST':
+        # 1. Validate the uploaded file
         v_uploaded_file = request.files.get('file')
         if not v_uploaded_file or not v_uploaded_file.filename.endswith('.zip'):
+            return jsonify({'error': 'Please upload a valid zip file (.zip).'}), 400
+        # 2. Get the uploads folder from Flask config
+        v_uploads_folder = current_app.config['UPLOAD_FOLDER']
+        # Example: "app/uploads"
+        # 3. Clean up the existing contents in uploads folder
+        cleanup_uploads_folder(v_uploads_folder)
+        os.makedirs(v_uploads_folder, exist_ok=True)
+        # 4. Save the new ZIP file into the uploads folder
+        v_upload_path = os.path.join(v_uploads_folder, v_uploaded_file.filename)
         v_uploaded_file.save(v_upload_path)
+        # 5. Extract the ZIP file (directly into app/uploads)
+        handle_zip_upload(v_upload_path)
+        # 6. Process all extracted files to create/update vector DB in app/uploads/vectors
+        process_files_to_vectors(v_uploads_folder)
+        # 7. Build a timestamped filename for the final zip
+        v_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        v_zip_filename = f"uploads_all_{v_timestamp}.zip"
+        # Final zip will be inside app/uploads
+        v_final_zip_path = os.path.join(v_uploads_folder, v_zip_filename)
+        # 8. Zip the entire app/uploads folder (skip the final zip to prevent recursion)
+        with zipfile.ZipFile(v_final_zip_path, 'w', zipfile.ZIP_DEFLATED) as obj_zip:
+            for v_root, _, v_files in os.walk(v_uploads_folder):
+                for v_file in v_files:
+                    # Exclude the new zip itself from being re-zipped
+                    if v_file == v_zip_filename:
+                        continue
+                    v_full_path = os.path.join(v_root, v_file)
+                    v_arcname = os.path.relpath(v_full_path, start=v_uploads_folder)
+                    obj_zip.write(v_full_path, arcname=v_arcname)
+        # 9. Send the final zip file to the user
+        return send_file(v_final_zip_path, as_attachment=True)
+    # If GET, render the upload form
     return render_template('index.html')
+def cleanup_uploads_folder(v_folder_path):
+    """
+    Deletes all files/folders inside v_folder_path.
+    """
+    if os.path.exists(v_folder_path):
+        for v_item in os.listdir(v_folder_path):
+            v_item_path = os.path.join(v_folder_path, v_item)
+            if os.path.isfile(v_item_path):
+                os.remove(v_item_path)
+            elif os.path.isdir(v_item_path):
+                shutil.rmtree(v_item_path)

app/templates/index.html CHANGED Viewed

@@ -3,50 +3,19 @@
 <head>
     <meta charset="UTF-8" />
     <title>Vector DB Creator</title>
-    <style>
-        body {
-            font-family: Arial, sans-serif;
-            margin: 40px;
-        }
-        header {
-            display: flex;
-            align-items: center;
-        }
-        header img {
-            margin-right: 20px;
-            width: 50px;
-        }
-        h1 {
-            margin: 0;
-        }
-        .content {
-            margin-top: 20px;
-        }
-        form {
-            margin-top: 20px;
-        }
-        label {
-            display: inline-block;
-            width: 120px;
-        }
-    </style>
 </head>
 <body>
-    <header>
-        <img src="{{ url_for('static', filename='logo.png') }}" alt="Logo" />
-        <h1>Change your PDF, PPT, and CSV data to Vector DB</h1>
-    </header>
-    <div class="content">
-        <p>This application allows you to upload a .zip containing your data files (PDF, PPTX, or CSV)
-        and convert them into a vector database. Then you can download the processed vector DB as a .zip.</p>
-        <form action="/" method="POST" enctype="multipart/form-data">
-            <label for="file">Upload ZIP File:</label>
-            <input type="file" name="file" id="file" accept=".zip" required />
-            <button type="submit">Upload & Convert</button>
-        </form>
-        <p>After processing, you will be prompted to download the resulting vector database.</p>
-    </div>
 </body>
 </html>

 <head>
     <meta charset="UTF-8" />
     <title>Vector DB Creator</title>
 </head>
 <body>
+    <h1>Upload Your .ZIP File</h1>
+    <p>
+        On upload, the existing <strong>uploads</strong> folder is cleared.
+        Then, your new files are extracted, vector DB is created,
+        and finally, we zip the entire <strong>uploads</strong> folder
+        with a timestamped name.
+    </p>
+    <form action="/" method="POST" enctype="multipart/form-data">
+        <label for="file">Choose ZIP File:</label>
+        <input type="file" name="file" id="file" accept=".zip" required>
+        <button type="submit">Upload & Process</button>
+    </form>
 </body>
 </html>

app/utils/__pycache__/vector_db.cpython-310.pyc DELETED Viewed

Binary file (1.92 kB)

app/utils/__pycache__/zip_handler.cpython-310.pyc DELETED Viewed

Binary file (494 Bytes)

app/utils/file_handler.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import os
 import fitz  # PyMuPDF
 import pandas as pd
 from pptx import Presentation
@@ -9,14 +8,12 @@ def extract_text_from_file(v_file_path):
     """
     v_text = ""
-    # PDF
     if v_file_path.lower().endswith('.pdf'):
         obj_pdf = fitz.open(v_file_path)
         for obj_page in obj_pdf:
             v_text += obj_page.get_text()
         obj_pdf.close()
-    # PPTX
     elif v_file_path.lower().endswith('.pptx'):
         obj_ppt = Presentation(v_file_path)
         for obj_slide in obj_ppt.slides:
@@ -24,7 +21,6 @@ def extract_text_from_file(v_file_path):
                 if obj_shape.has_text_frame:
                     v_text += obj_shape.text_frame.text + "\n"
-    # CSV
     elif v_file_path.lower().endswith('.csv'):
         v_data = pd.read_csv(v_file_path)
         v_text += v_data.to_string()

 import fitz  # PyMuPDF
 import pandas as pd
 from pptx import Presentation
     """
     v_text = ""
     if v_file_path.lower().endswith('.pdf'):
         obj_pdf = fitz.open(v_file_path)
         for obj_page in obj_pdf:
             v_text += obj_page.get_text()
         obj_pdf.close()
     elif v_file_path.lower().endswith('.pptx'):
         obj_ppt = Presentation(v_file_path)
         for obj_slide in obj_ppt.slides:
                 if obj_shape.has_text_frame:
                     v_text += obj_shape.text_frame.text + "\n"
     elif v_file_path.lower().endswith('.csv'):
         v_data = pd.read_csv(v_file_path)
         v_text += v_data.to_string()

app/utils/vector_db.py CHANGED Viewed

@@ -5,50 +5,48 @@ import torch
 from sentence_transformers import SentenceTransformer
 from .file_handler import extract_text_from_file
-# Determine if GPU is available
 v_device = "cuda" if torch.cuda.is_available() else "cpu"
 obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
 def process_files_to_vectors(v_folder_path):
     """
-    Processes files (PDF, PPTX, CSV) to create/update a FAISS vector database.
-    Returns the path to the folder containing the FAISS index and metadata.
     """
     v_vector_folder = os.path.join(v_folder_path, 'vectors')
     os.makedirs(v_vector_folder, exist_ok=True)
-    # Create a brand-new FAISS index
-    # For sentence-transformers/all-MiniLM-L6-v2, embedding dimension is 384
     v_index = faiss.IndexFlatL2(384)
     v_metadata = {}
-    # Iterate over extracted files
     v_doc_counter = 0
     for v_root, _, v_files in os.walk(v_folder_path):
         for v_file in v_files:
             v_file_path = os.path.join(v_root, v_file)
             if v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
                 v_text = extract_text_from_file(v_file_path)
                 if not v_text.strip():
-                    continue  # Skip empty content
-                # Encode text into embeddings
-                # convert_to_tensor=True yields a PyTorch tensor, so convert to numpy
                 v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
-                # Add to FAISS index
                 v_index.add(v_embeddings)
-                # Map index ID to filename
-                v_metadata[v_doc_counter] = v_file_path
                 v_doc_counter += 1
-    # Save FAISS index
     v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
     faiss.write_index(v_index, v_index_path)
-    # Save metadata
     import json
     with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
         json.dump(v_metadata, obj_meta, indent=4)

 from sentence_transformers import SentenceTransformer
 from .file_handler import extract_text_from_file
 v_device = "cuda" if torch.cuda.is_available() else "cpu"
 obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
 def process_files_to_vectors(v_folder_path):
     """
+    Processes files (PDF, PPTX, CSV) found in v_folder_path to build a FAISS vector DB.
+    Stores only the reference path in metadata.json (no file name or full path).
     """
     v_vector_folder = os.path.join(v_folder_path, 'vectors')
     os.makedirs(v_vector_folder, exist_ok=True)
+    # Create a FAISS index (384 dimensions for all-MiniLM-L6-v2)
     v_index = faiss.IndexFlatL2(384)
     v_metadata = {}
     v_doc_counter = 0
     for v_root, _, v_files in os.walk(v_folder_path):
         for v_file in v_files:
             v_file_path = os.path.join(v_root, v_file)
+            # Filter files by extension
             if v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
                 v_text = extract_text_from_file(v_file_path)
                 if not v_text.strip():
+                    continue  # skip empty files
+                # Convert text to embeddings
                 v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
                 v_index.add(v_embeddings)
+                # Generate a relative path (reference path only)
+                v_reference_path = os.path.relpath(v_file_path, start=v_folder_path)
+                # Store only the reference path in metadata
+                v_metadata[v_doc_counter] = v_reference_path
                 v_doc_counter += 1
+    # Save the FAISS index
     v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
     faiss.write_index(v_index, v_index_path)
+    # Save metadata (containing only reference paths)
     import json
     with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
         json.dump(v_metadata, obj_meta, indent=4)

app/utils/zip_handler.py CHANGED Viewed

@@ -3,9 +3,10 @@ import zipfile
 def handle_zip_upload(v_zip_path):
     """
-    Extracts ZIP file contents into a subfolder of app/uploads.
     """
-    v_extracted_path = os.path.splitext(v_zip_path)[0]
     with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
-        obj_zip.extractall(v_extracted_path)
-    return v_extracted_path

 def handle_zip_upload(v_zip_path):
     """
+    Extracts the ZIP into the same folder as v_zip_path.
+    If v_zip_path = 'app/uploads/myfile.zip', extracts into 'app/uploads'.
     """
+    v_parent_folder = os.path.dirname(v_zip_path)
     with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
+        obj_zip.extractall(v_parent_folder)
+    return v_parent_folder

run.py CHANGED Viewed

@@ -2,5 +2,6 @@ from app import create_app
 if __name__ == "__main__":
     obj_app = create_app()
-    # Run the Flask app on 0.0.0.0:7860 for Hugging Face Spaces or local Docker
-    obj_app.run(host='0.0.0.0', port=7860)

 if __name__ == "__main__":
     obj_app = create_app()
+    # Host 0.0.0.0 so it's accessible in Docker or on local network
+    # Port 7860 is an example; you can choose another
+    obj_app.run(host='0.0.0.0', port=7860, debug=False)