Spaces:
Runtime error
Runtime error
Commit
·
e4dba65
1
Parent(s):
ecf18d6
commit as its working on local
Browse files- .gitignore +3 -0
- app/__init__.py +12 -1
- app/__pycache__/__init__.cpython-310.pyc +0 -0
- app/__pycache__/routes.cpython-310.pyc +0 -0
- app/routes.py +54 -20
- app/templates/index.html +12 -43
- app/utils/__pycache__/vector_db.cpython-310.pyc +0 -0
- app/utils/__pycache__/zip_handler.cpython-310.pyc +0 -0
- app/utils/file_handler.py +0 -4
- app/utils/vector_db.py +15 -17
- app/utils/zip_handler.py +5 -4
- run.py +3 -2
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.venv/
|
2 |
+
.venv
|
3 |
+
__pycache__/
|
app/__init__.py
CHANGED
@@ -1,8 +1,19 @@
|
|
|
|
|
|
1 |
from flask import Flask
|
2 |
|
3 |
def create_app():
|
4 |
v_app = Flask(__name__)
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
v_app.config['ALLOWED_EXTENSIONS'] = {'zip'}
|
7 |
|
8 |
with v_app.app_context():
|
|
|
1 |
+
# app/__init__.py
|
2 |
+
import os
|
3 |
from flask import Flask
|
4 |
|
5 |
def create_app():
|
6 |
v_app = Flask(__name__)
|
7 |
+
|
8 |
+
# Get the absolute path of the "app" folder
|
9 |
+
v_base_dir = os.path.dirname(os.path.abspath(__file__))
|
10 |
+
# This should point to your "app" folder, for example:
|
11 |
+
# C:\Users\sharm\Documents\projects\VectorDBConversionfromfiles\app
|
12 |
+
|
13 |
+
# Now "uploads" will be: C:\Users\sharm\Documents\projects\VectorDBConversionfromfiles\app\uploads
|
14 |
+
v_uploads_folder = os.path.join(v_base_dir, 'uploads')
|
15 |
+
v_app.config['UPLOAD_FOLDER'] = v_uploads_folder
|
16 |
+
|
17 |
v_app.config['ALLOWED_EXTENSIONS'] = {'zip'}
|
18 |
|
19 |
with v_app.app_context():
|
app/__pycache__/__init__.cpython-310.pyc
DELETED
Binary file (564 Bytes)
|
|
app/__pycache__/routes.cpython-310.pyc
DELETED
Binary file (1.1 kB)
|
|
app/routes.py
CHANGED
@@ -1,38 +1,72 @@
|
|
1 |
import os
|
2 |
-
from flask import Blueprint, render_template, request, send_file, jsonify
|
3 |
-
from .utils.vector_db import process_files_to_vectors
|
4 |
-
from .utils.zip_handler import handle_zip_upload # We'll create this utility
|
5 |
import zipfile
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
v_bp = Blueprint('routes', __name__)
|
8 |
|
9 |
@v_bp.route('/', methods=['GET', 'POST'])
|
10 |
def home():
|
11 |
if request.method == 'POST':
|
|
|
12 |
v_uploaded_file = request.files.get('file')
|
13 |
if not v_uploaded_file or not v_uploaded_file.filename.endswith('.zip'):
|
14 |
-
return jsonify({'error': 'Please upload a valid zip file.'}), 400
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
#
|
17 |
-
|
|
|
|
|
|
|
|
|
18 |
v_uploaded_file.save(v_upload_path)
|
19 |
|
20 |
-
# Extract the ZIP
|
21 |
-
|
|
|
|
|
|
|
22 |
|
23 |
-
#
|
24 |
-
|
|
|
|
|
|
|
25 |
|
26 |
-
# Zip the
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
35 |
|
36 |
-
|
|
|
37 |
|
|
|
38 |
return render_template('index.html')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
|
|
|
|
|
|
2 |
import zipfile
|
3 |
+
import shutil
|
4 |
+
import datetime
|
5 |
+
|
6 |
+
from flask import Blueprint, render_template, request, send_file, jsonify, current_app
|
7 |
+
from .utils.zip_handler import handle_zip_upload
|
8 |
+
from .utils.vector_db import process_files_to_vectors
|
9 |
|
10 |
v_bp = Blueprint('routes', __name__)
|
11 |
|
12 |
@v_bp.route('/', methods=['GET', 'POST'])
def home():
    """
    Serve the upload form (GET) or run the full upload pipeline (POST).

    POST flow: validate the uploaded ZIP, clear the configured uploads
    folder, save and extract the ZIP there, build the vector DB from the
    extracted files, then bundle the whole uploads folder into a
    timestamped ZIP and send it back as a download.

    Returns:
        The rendered 'index.html' template for GET, the final ZIP as an
        attachment for a successful POST, or a JSON error with HTTP 400
        when the upload is missing, is not named *.zip, or is not a
        readable ZIP archive.
    """
    if request.method != 'POST':
        # If GET, render the upload form
        return render_template('index.html')

    # 1. Validate the uploaded file
    v_uploaded_file = request.files.get('file')
    v_raw_name = (v_uploaded_file.filename or '') if v_uploaded_file else ''
    # The filename is client-controlled: keep only its last path component
    # (treating "\" as a separator too) so a name such as "../../x.zip"
    # cannot be written outside the uploads folder.
    v_safe_name = os.path.basename(v_raw_name.replace('\\', '/'))
    # Compare case-insensitively so "ARCHIVE.ZIP" is accepted as well.
    if not v_safe_name.lower().endswith('.zip'):
        return jsonify({'error': 'Please upload a valid zip file (.zip).'}), 400

    # 2. Get the uploads folder from Flask config
    v_uploads_folder = current_app.config['UPLOAD_FOLDER']

    # 3. Clean up the existing contents in uploads folder
    cleanup_uploads_folder(v_uploads_folder)
    os.makedirs(v_uploads_folder, exist_ok=True)

    # 4. Save the new ZIP file into the uploads folder
    v_upload_path = os.path.join(v_uploads_folder, v_safe_name)
    v_uploaded_file.save(v_upload_path)

    # 5. Extract the ZIP file (directly into the uploads folder)
    try:
        handle_zip_upload(v_upload_path)
    except zipfile.BadZipFile:
        # A ".zip" name does not guarantee ZIP content; report a client
        # error instead of letting the exception surface as HTTP 500.
        return jsonify({'error': 'Please upload a valid zip file (.zip).'}), 400

    # 6. Process all extracted files to create/update the vector DB
    process_files_to_vectors(v_uploads_folder)

    # 7. Build a timestamped filename for the final zip (kept inside uploads)
    v_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    v_zip_filename = f"uploads_all_{v_timestamp}.zip"
    v_final_zip_path = os.path.join(v_uploads_folder, v_zip_filename)

    # 8. Zip the entire uploads folder
    with zipfile.ZipFile(v_final_zip_path, 'w', zipfile.ZIP_DEFLATED) as obj_zip:
        for v_root, _, v_files in os.walk(v_uploads_folder):
            for v_file in v_files:
                v_full_path = os.path.join(v_root, v_file)
                # Skip only the archive being written. Comparing full paths
                # (not bare names) keeps a same-named file that happens to
                # live in an extracted subfolder.
                if v_full_path == v_final_zip_path:
                    continue
                v_arcname = os.path.relpath(v_full_path, start=v_uploads_folder)
                obj_zip.write(v_full_path, arcname=v_arcname)

    # 9. Send the final zip file to the user
    return send_file(v_final_zip_path, as_attachment=True)
|
60 |
+
|
61 |
+
|
62 |
+
def cleanup_uploads_folder(v_folder_path):
    """
    Delete everything inside v_folder_path, keeping the folder itself.

    Real sub-directories are removed recursively. Every other entry
    (regular files, symlinks of any kind, broken links) is unlinked, so a
    symlink is removed without touching whatever it points to.

    Args:
        v_folder_path: Path of the folder to empty. A missing folder is
            treated as already clean.

    Returns:
        None.
    """
    try:
        # Snapshot the entries first so nothing is deleted while the
        # directory iterator is still open.
        with os.scandir(v_folder_path) as obj_entries:
            v_entries = list(obj_entries)
    except FileNotFoundError:
        # Nothing to clean; the caller creates the folder afterwards.
        return

    for obj_entry in v_entries:
        # follow_symlinks=False: a link to a directory must be unlinked,
        # because shutil.rmtree refuses to operate on symlinks.
        if obj_entry.is_dir(follow_symlinks=False):
            shutil.rmtree(obj_entry.path)
        else:
            os.remove(obj_entry.path)
|
app/templates/index.html
CHANGED
@@ -3,50 +3,19 @@
|
|
3 |
<head>
|
4 |
<meta charset="UTF-8" />
|
5 |
<title>Vector DB Creator</title>
|
6 |
-
<style>
|
7 |
-
body {
|
8 |
-
font-family: Arial, sans-serif;
|
9 |
-
margin: 40px;
|
10 |
-
}
|
11 |
-
header {
|
12 |
-
display: flex;
|
13 |
-
align-items: center;
|
14 |
-
}
|
15 |
-
header img {
|
16 |
-
margin-right: 20px;
|
17 |
-
width: 50px;
|
18 |
-
}
|
19 |
-
h1 {
|
20 |
-
margin: 0;
|
21 |
-
}
|
22 |
-
.content {
|
23 |
-
margin-top: 20px;
|
24 |
-
}
|
25 |
-
form {
|
26 |
-
margin-top: 20px;
|
27 |
-
}
|
28 |
-
label {
|
29 |
-
display: inline-block;
|
30 |
-
width: 120px;
|
31 |
-
}
|
32 |
-
</style>
|
33 |
</head>
|
34 |
<body>
|
35 |
-
<
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
<
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
</form>
|
48 |
-
|
49 |
-
<p>After processing, you will be prompted to download the resulting vector database.</p>
|
50 |
-
</div>
|
51 |
</body>
|
52 |
</html>
|
|
|
3 |
<head>
|
4 |
<meta charset="UTF-8" />
|
5 |
<title>Vector DB Creator</title>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
</head>
|
7 |
<body>
|
8 |
+
<h1>Upload Your .ZIP File</h1>
|
9 |
+
<p>
|
10 |
+
On upload, the existing <strong>uploads</strong> folder is cleared.
|
11 |
+
Then, your new files are extracted, vector DB is created,
|
12 |
+
and finally, we zip the entire <strong>uploads</strong> folder
|
13 |
+
with a timestamped name.
|
14 |
+
</p>
|
15 |
+
<form action="/" method="POST" enctype="multipart/form-data">
|
16 |
+
<label for="file">Choose ZIP File:</label>
|
17 |
+
<input type="file" name="file" id="file" accept=".zip" required>
|
18 |
+
<button type="submit">Upload & Process</button>
|
19 |
+
</form>
|
|
|
|
|
|
|
|
|
20 |
</body>
|
21 |
</html>
|
app/utils/__pycache__/vector_db.cpython-310.pyc
DELETED
Binary file (1.92 kB)
|
|
app/utils/__pycache__/zip_handler.cpython-310.pyc
DELETED
Binary file (494 Bytes)
|
|
app/utils/file_handler.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import os
|
2 |
import fitz # PyMuPDF
|
3 |
import pandas as pd
|
4 |
from pptx import Presentation
|
@@ -9,14 +8,12 @@ def extract_text_from_file(v_file_path):
|
|
9 |
"""
|
10 |
v_text = ""
|
11 |
|
12 |
-
# PDF
|
13 |
if v_file_path.lower().endswith('.pdf'):
|
14 |
obj_pdf = fitz.open(v_file_path)
|
15 |
for obj_page in obj_pdf:
|
16 |
v_text += obj_page.get_text()
|
17 |
obj_pdf.close()
|
18 |
|
19 |
-
# PPTX
|
20 |
elif v_file_path.lower().endswith('.pptx'):
|
21 |
obj_ppt = Presentation(v_file_path)
|
22 |
for obj_slide in obj_ppt.slides:
|
@@ -24,7 +21,6 @@ def extract_text_from_file(v_file_path):
|
|
24 |
if obj_shape.has_text_frame:
|
25 |
v_text += obj_shape.text_frame.text + "\n"
|
26 |
|
27 |
-
# CSV
|
28 |
elif v_file_path.lower().endswith('.csv'):
|
29 |
v_data = pd.read_csv(v_file_path)
|
30 |
v_text += v_data.to_string()
|
|
|
|
|
1 |
import fitz # PyMuPDF
|
2 |
import pandas as pd
|
3 |
from pptx import Presentation
|
|
|
8 |
"""
|
9 |
v_text = ""
|
10 |
|
|
|
11 |
if v_file_path.lower().endswith('.pdf'):
|
12 |
obj_pdf = fitz.open(v_file_path)
|
13 |
for obj_page in obj_pdf:
|
14 |
v_text += obj_page.get_text()
|
15 |
obj_pdf.close()
|
16 |
|
|
|
17 |
elif v_file_path.lower().endswith('.pptx'):
|
18 |
obj_ppt = Presentation(v_file_path)
|
19 |
for obj_slide in obj_ppt.slides:
|
|
|
21 |
if obj_shape.has_text_frame:
|
22 |
v_text += obj_shape.text_frame.text + "\n"
|
23 |
|
|
|
24 |
elif v_file_path.lower().endswith('.csv'):
|
25 |
v_data = pd.read_csv(v_file_path)
|
26 |
v_text += v_data.to_string()
|
app/utils/vector_db.py
CHANGED
@@ -5,50 +5,48 @@ import torch
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
from .file_handler import extract_text_from_file
|
7 |
|
8 |
-
# Determine if GPU is available
|
9 |
v_device = "cuda" if torch.cuda.is_available() else "cpu"
|
10 |
obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
|
11 |
|
12 |
def process_files_to_vectors(v_folder_path):
|
13 |
"""
|
14 |
-
Processes files (PDF, PPTX, CSV) to
|
15 |
-
|
16 |
"""
|
17 |
v_vector_folder = os.path.join(v_folder_path, 'vectors')
|
18 |
os.makedirs(v_vector_folder, exist_ok=True)
|
19 |
|
20 |
-
# Create a
|
21 |
-
# For sentence-transformers/all-MiniLM-L6-v2, embedding dimension is 384
|
22 |
v_index = faiss.IndexFlatL2(384)
|
23 |
v_metadata = {}
|
24 |
-
|
25 |
-
# Iterate over extracted files
|
26 |
v_doc_counter = 0
|
|
|
27 |
for v_root, _, v_files in os.walk(v_folder_path):
|
28 |
for v_file in v_files:
|
29 |
v_file_path = os.path.join(v_root, v_file)
|
30 |
-
|
|
|
31 |
if v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
|
32 |
v_text = extract_text_from_file(v_file_path)
|
33 |
if not v_text.strip():
|
34 |
-
continue #
|
35 |
|
36 |
-
#
|
37 |
-
# convert_to_tensor=True yields a PyTorch tensor, so convert to numpy
|
38 |
v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
|
39 |
-
|
40 |
-
# Add to FAISS index
|
41 |
v_index.add(v_embeddings)
|
42 |
|
43 |
-
#
|
44 |
-
|
|
|
|
|
|
|
45 |
v_doc_counter += 1
|
46 |
|
47 |
-
# Save FAISS index
|
48 |
v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
|
49 |
faiss.write_index(v_index, v_index_path)
|
50 |
|
51 |
-
# Save metadata
|
52 |
import json
|
53 |
with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
|
54 |
json.dump(v_metadata, obj_meta, indent=4)
|
|
|
5 |
from sentence_transformers import SentenceTransformer
|
6 |
from .file_handler import extract_text_from_file
|
7 |
|
|
|
8 |
v_device = "cuda" if torch.cuda.is_available() else "cpu"
|
9 |
obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
|
10 |
|
11 |
def process_files_to_vectors(v_folder_path):
    """
    Build a FAISS vector DB from the PDF, PPTX and CSV files under v_folder_path.

    The folder is walked recursively. Each matching file with non-blank
    extracted text is embedded as a single vector and added to a flat L2
    index. Results are written to '<v_folder_path>/vectors':
      - 'vector_index.faiss': the FAISS index
      - 'metadata.json': maps each vector's position in the index to the
        file's path relative to v_folder_path (the absolute path is never
        stored)

    Args:
        v_folder_path: Root folder containing the extracted files.

    Returns:
        None. Output is written to disk.
    """
    # Kept function-local, as in the original block.
    import json

    v_vector_folder = os.path.join(v_folder_path, 'vectors')
    os.makedirs(v_vector_folder, exist_ok=True)

    # Size the index from the loaded model instead of hard-coding it, so
    # changing the model name cannot leave a mismatched index dimension.
    # Falls back to 384 (all-MiniLM-L6-v2) if the model does not report one.
    v_dimension = obj_embedding_model.get_sentence_embedding_dimension() or 384
    v_index = faiss.IndexFlatL2(v_dimension)
    v_metadata = {}
    v_doc_counter = 0

    for v_root, _, v_files in os.walk(v_folder_path):
        for v_file in v_files:
            v_file_path = os.path.join(v_root, v_file)

            # Filter files by extension
            if not v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
                continue

            v_text = extract_text_from_file(v_file_path)
            if not v_text.strip():
                continue  # skip empty files

            # Convert text to embeddings (one vector per file).
            # NOTE(review): very long texts are presumably truncated to the
            # model's maximum sequence length -- confirm whether chunking
            # is needed.
            v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
            v_index.add(v_embeddings)

            # Store the path relative to the root folder, using '/' on every
            # OS so entries match ZIP member names, which always use '/'.
            v_reference_path = os.path.relpath(v_file_path, start=v_folder_path)
            v_metadata[v_doc_counter] = v_reference_path.replace(os.sep, '/')
            v_doc_counter += 1

    # Save the FAISS index
    v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
    faiss.write_index(v_index, v_index_path)

    # Save metadata (JSON serialises the integer keys as strings)
    with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
        json.dump(v_metadata, obj_meta, indent=4)
|
app/utils/zip_handler.py
CHANGED
@@ -3,9 +3,10 @@ import zipfile
|
|
3 |
|
4 |
def handle_zip_upload(v_zip_path):
|
5 |
"""
|
6 |
-
Extracts ZIP
|
|
|
7 |
"""
|
8 |
-
|
9 |
with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
|
10 |
-
obj_zip.extractall(
|
11 |
-
return
|
|
|
3 |
|
4 |
def handle_zip_upload(v_zip_path):
    """
    Extract the ZIP into the same folder that contains v_zip_path.

    If v_zip_path = 'app/uploads/myfile.zip', extracts into 'app/uploads'.
    A bare file name such as 'myfile.zip' extracts into the current
    directory and returns '.' rather than an empty string, so the result
    is always usable as a directory path.

    Args:
        v_zip_path: Path to an existing ZIP archive.

    Returns:
        The folder the archive was extracted into.

    Raises:
        zipfile.BadZipFile: If the file is not a valid ZIP archive.
    """
    # dirname() is '' when the path has no directory part; '' is not a
    # directory callers can list or walk, so fall back to os.curdir.
    v_parent_folder = os.path.dirname(v_zip_path) or os.curdir
    with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
        obj_zip.extractall(v_parent_folder)
    return v_parent_folder
|
run.py
CHANGED
@@ -2,5 +2,6 @@ from app import create_app
|
|
2 |
|
3 |
if __name__ == "__main__":
|
4 |
obj_app = create_app()
|
5 |
-
#
|
6 |
-
|
|
|
|
2 |
|
3 |
if __name__ == "__main__":
    # Bind to every interface (0.0.0.0) so the server is reachable from
    # Docker or the local network. Port 7860 is only an example and can be
    # changed. Debug mode stays off.
    v_run_options = {'host': '0.0.0.0', 'port': 7860, 'debug': False}
    obj_app = create_app()
    obj_app.run(**v_run_options)
|