vishalsh13 committed on
Commit
e4dba65
·
1 Parent(s): ecf18d6

Commit, as it's working locally

Browse files
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .venv\
2
+ .venv
3
+ __pycache__/
app/__init__.py CHANGED
@@ -1,8 +1,19 @@
 
 
1
  from flask import Flask
2
 
3
  def create_app():
4
  v_app = Flask(__name__)
5
- v_app.config['UPLOAD_FOLDER'] = 'app/uploads/'
 
 
 
 
 
 
 
 
 
6
  v_app.config['ALLOWED_EXTENSIONS'] = {'zip'}
7
 
8
  with v_app.app_context():
 
1
+ # app/__init__.py
2
+ import os
3
  from flask import Flask
4
 
5
  def create_app():
6
  v_app = Flask(__name__)
7
+
8
+ # Get the absolute path of the "app" folder
9
+ v_base_dir = os.path.dirname(os.path.abspath(__file__))
10
+ # This should point to your "app" folder, for example:
11
+ # C:\Users\sharm\Documents\projects\VectorDBConversionfromfiles\app
12
+
13
+ # Now "uploads" will be: C:\Users\sharm\Documents\projects\VectorDBConversionfromfiles\app\uploads
14
+ v_uploads_folder = os.path.join(v_base_dir, 'uploads')
15
+ v_app.config['UPLOAD_FOLDER'] = v_uploads_folder
16
+
17
  v_app.config['ALLOWED_EXTENSIONS'] = {'zip'}
18
 
19
  with v_app.app_context():
app/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (564 Bytes)
 
app/__pycache__/routes.cpython-310.pyc DELETED
Binary file (1.1 kB)
 
app/routes.py CHANGED
@@ -1,38 +1,72 @@
1
  import os
2
- from flask import Blueprint, render_template, request, send_file, jsonify
3
- from .utils.vector_db import process_files_to_vectors
4
- from .utils.zip_handler import handle_zip_upload # We'll create this utility
5
  import zipfile
 
 
 
 
 
 
6
 
7
  v_bp = Blueprint('routes', __name__)
8
 
9
  @v_bp.route('/', methods=['GET', 'POST'])
10
  def home():
11
  if request.method == 'POST':
 
12
  v_uploaded_file = request.files.get('file')
13
  if not v_uploaded_file or not v_uploaded_file.filename.endswith('.zip'):
14
- return jsonify({'error': 'Please upload a valid zip file.'}), 400
 
 
 
 
15
 
16
- # Save uploaded ZIP
17
- v_upload_path = os.path.join('app/uploads', v_uploaded_file.filename)
 
 
 
 
18
  v_uploaded_file.save(v_upload_path)
19
 
20
- # Extract the ZIP
21
- v_extracted_folder = handle_zip_upload(v_upload_path)
 
 
 
22
 
23
- # Process to create or update vector DB
24
- v_result_folder = process_files_to_vectors(v_extracted_folder)
 
 
 
25
 
26
- # Zip the resulting vectors folder for download
27
- v_result_zip_path = os.path.join('app/uploads', 'vector_db_result.zip')
28
- obj_zip = zipfile.ZipFile(v_result_zip_path, 'w', zipfile.ZIP_DEFLATED)
29
- for v_root, _, v_files in os.walk(v_result_folder):
30
- for v_file in v_files:
31
- v_full_path = os.path.join(v_root, v_file)
32
- v_arcname = os.path.relpath(v_full_path, start=v_result_folder)
33
- obj_zip.write(v_full_path, arcname=v_arcname)
34
- obj_zip.close()
 
35
 
36
- return send_file(v_result_zip_path, as_attachment=True)
 
37
 
 
38
  return render_template('index.html')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
 
 
2
  import zipfile
3
+ import shutil
4
+ import datetime
5
+
6
+ from flask import Blueprint, render_template, request, send_file, jsonify, current_app
7
+ from .utils.zip_handler import handle_zip_upload
8
+ from .utils.vector_db import process_files_to_vectors
9
 
10
  v_bp = Blueprint('routes', __name__)
11
 
@v_bp.route('/', methods=['GET', 'POST'])
def home():
    """
    Render the upload form (GET) or process an uploaded ZIP file (POST).

    On POST the uploads folder taken from the Flask config is emptied, the
    uploaded ZIP is saved and extracted there, the vector DB is built from
    the extracted files, and the whole uploads folder is sent back as a
    timestamped ZIP download.

    Returns:
        The rendered 'index.html' page for GET, a file download for a
        successful POST, or a JSON error with HTTP 400 when the upload is
        missing, is not named *.zip, or is not a readable ZIP archive.
    """
    if request.method == 'POST':
        # 1. Validate the uploaded file
        v_uploaded_file = request.files.get('file')

        # The file name is chosen by the client. Keep only its last path
        # component so a name such as '../../x.zip' cannot escape the
        # uploads folder (backslashes are normalised first because some
        # clients send Windows-style paths).
        v_safe_name = ''
        if v_uploaded_file and v_uploaded_file.filename:
            v_safe_name = os.path.basename(v_uploaded_file.filename.replace('\\', '/'))

        # Case-insensitive so that 'DATA.ZIP' is accepted as well.
        if not v_safe_name.lower().endswith('.zip'):
            return jsonify({'error': 'Please upload a valid zip file (.zip).'}), 400

        # 2. Get the uploads folder from Flask config
        v_uploads_folder = current_app.config['UPLOAD_FOLDER']

        # 3. Clean up the existing contents in uploads folder
        cleanup_uploads_folder(v_uploads_folder)
        os.makedirs(v_uploads_folder, exist_ok=True)

        # 4. Save the new ZIP file into the uploads folder
        v_upload_path = os.path.join(v_uploads_folder, v_safe_name)
        v_uploaded_file.save(v_upload_path)

        # 5. Extract the ZIP file (directly into the uploads folder)
        try:
            handle_zip_upload(v_upload_path)
        except zipfile.BadZipFile:
            # The name ended in .zip but the content is not a ZIP archive:
            # report it as a client error instead of an unhandled exception.
            return jsonify({'error': 'Please upload a valid zip file (.zip).'}), 400

        # 6. Process all extracted files to create/update the vector DB
        #    (written to the 'vectors' subfolder of the uploads folder)
        process_files_to_vectors(v_uploads_folder)

        # 7. Build a timestamped filename for the final zip
        v_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        v_zip_filename = f"uploads_all_{v_timestamp}.zip"
        # Final zip will be inside the uploads folder
        v_final_zip_path = os.path.join(v_uploads_folder, v_zip_filename)

        # 8. Zip the entire uploads folder (skip the final zip to prevent recursion)
        with zipfile.ZipFile(v_final_zip_path, 'w', zipfile.ZIP_DEFLATED) as obj_zip:
            for v_root, _, v_files in os.walk(v_uploads_folder):
                for v_file in v_files:
                    v_full_path = os.path.join(v_root, v_file)
                    # Compare full paths, not bare names, so that only the
                    # archive being written is excluded.
                    if v_full_path == v_final_zip_path:
                        continue
                    v_arcname = os.path.relpath(v_full_path, start=v_uploads_folder)
                    obj_zip.write(v_full_path, arcname=v_arcname)

        # 9. Send the final zip file to the user
        return send_file(v_final_zip_path, as_attachment=True)

    # If GET, render the upload form
    return render_template('index.html')
60
+
61
+
62
+ def cleanup_uploads_folder(v_folder_path):
63
+ """
64
+ Deletes all files/folders inside v_folder_path.
65
+ """
66
+ if os.path.exists(v_folder_path):
67
+ for v_item in os.listdir(v_folder_path):
68
+ v_item_path = os.path.join(v_folder_path, v_item)
69
+ if os.path.isfile(v_item_path):
70
+ os.remove(v_item_path)
71
+ elif os.path.isdir(v_item_path):
72
+ shutil.rmtree(v_item_path)
app/templates/index.html CHANGED
@@ -3,50 +3,19 @@
3
  <head>
4
  <meta charset="UTF-8" />
5
  <title>Vector DB Creator</title>
6
- <style>
7
- body {
8
- font-family: Arial, sans-serif;
9
- margin: 40px;
10
- }
11
- header {
12
- display: flex;
13
- align-items: center;
14
- }
15
- header img {
16
- margin-right: 20px;
17
- width: 50px;
18
- }
19
- h1 {
20
- margin: 0;
21
- }
22
- .content {
23
- margin-top: 20px;
24
- }
25
- form {
26
- margin-top: 20px;
27
- }
28
- label {
29
- display: inline-block;
30
- width: 120px;
31
- }
32
- </style>
33
  </head>
34
  <body>
35
- <header>
36
- <img src="{{ url_for('static', filename='logo.png') }}" alt="Logo" />
37
- <h1>Change your PDF, PPT, and CSV data to Vector DB</h1>
38
- </header>
39
- <div class="content">
40
- <p>This application allows you to upload a .zip containing your data files (PDF, PPTX, or CSV)
41
- and convert them into a vector database. Then you can download the processed vector DB as a .zip.</p>
42
-
43
- <form action="/" method="POST" enctype="multipart/form-data">
44
- <label for="file">Upload ZIP File:</label>
45
- <input type="file" name="file" id="file" accept=".zip" required />
46
- <button type="submit">Upload & Convert</button>
47
- </form>
48
-
49
- <p>After processing, you will be prompted to download the resulting vector database.</p>
50
- </div>
51
  </body>
52
  </html>
 
3
  <head>
4
  <meta charset="UTF-8" />
5
  <title>Vector DB Creator</title>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  </head>
7
  <body>
8
+ <h1>Upload Your .ZIP File</h1>
9
+ <p>
10
+ On upload, the existing <strong>uploads</strong> folder is cleared.
11
+ Then, your new files are extracted, vector DB is created,
12
+ and finally, we zip the entire <strong>uploads</strong> folder
13
+ with a timestamped name.
14
+ </p>
15
+ <form action="/" method="POST" enctype="multipart/form-data">
16
+ <label for="file">Choose ZIP File:</label>
17
+ <input type="file" name="file" id="file" accept=".zip" required>
18
+ <button type="submit">Upload & Process</button>
19
+ </form>
 
 
 
 
20
  </body>
21
  </html>
app/utils/__pycache__/vector_db.cpython-310.pyc DELETED
Binary file (1.92 kB)
 
app/utils/__pycache__/zip_handler.cpython-310.pyc DELETED
Binary file (494 Bytes)
 
app/utils/file_handler.py CHANGED
@@ -1,4 +1,3 @@
1
- import os
2
  import fitz # PyMuPDF
3
  import pandas as pd
4
  from pptx import Presentation
@@ -9,14 +8,12 @@ def extract_text_from_file(v_file_path):
9
  """
10
  v_text = ""
11
 
12
- # PDF
13
  if v_file_path.lower().endswith('.pdf'):
14
  obj_pdf = fitz.open(v_file_path)
15
  for obj_page in obj_pdf:
16
  v_text += obj_page.get_text()
17
  obj_pdf.close()
18
 
19
- # PPTX
20
  elif v_file_path.lower().endswith('.pptx'):
21
  obj_ppt = Presentation(v_file_path)
22
  for obj_slide in obj_ppt.slides:
@@ -24,7 +21,6 @@ def extract_text_from_file(v_file_path):
24
  if obj_shape.has_text_frame:
25
  v_text += obj_shape.text_frame.text + "\n"
26
 
27
- # CSV
28
  elif v_file_path.lower().endswith('.csv'):
29
  v_data = pd.read_csv(v_file_path)
30
  v_text += v_data.to_string()
 
 
1
  import fitz # PyMuPDF
2
  import pandas as pd
3
  from pptx import Presentation
 
8
  """
9
  v_text = ""
10
 
 
11
  if v_file_path.lower().endswith('.pdf'):
12
  obj_pdf = fitz.open(v_file_path)
13
  for obj_page in obj_pdf:
14
  v_text += obj_page.get_text()
15
  obj_pdf.close()
16
 
 
17
  elif v_file_path.lower().endswith('.pptx'):
18
  obj_ppt = Presentation(v_file_path)
19
  for obj_slide in obj_ppt.slides:
 
21
  if obj_shape.has_text_frame:
22
  v_text += obj_shape.text_frame.text + "\n"
23
 
 
24
  elif v_file_path.lower().endswith('.csv'):
25
  v_data = pd.read_csv(v_file_path)
26
  v_text += v_data.to_string()
app/utils/vector_db.py CHANGED
@@ -5,50 +5,48 @@ import torch
5
  from sentence_transformers import SentenceTransformer
6
  from .file_handler import extract_text_from_file
7
 
8
- # Determine if GPU is available
9
  v_device = "cuda" if torch.cuda.is_available() else "cpu"
10
  obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
11
 
12
  def process_files_to_vectors(v_folder_path):
13
  """
14
- Processes files (PDF, PPTX, CSV) to create/update a FAISS vector database.
15
- Returns the path to the folder containing the FAISS index and metadata.
16
  """
17
  v_vector_folder = os.path.join(v_folder_path, 'vectors')
18
  os.makedirs(v_vector_folder, exist_ok=True)
19
 
20
- # Create a brand-new FAISS index
21
- # For sentence-transformers/all-MiniLM-L6-v2, embedding dimension is 384
22
  v_index = faiss.IndexFlatL2(384)
23
  v_metadata = {}
24
-
25
- # Iterate over extracted files
26
  v_doc_counter = 0
 
27
  for v_root, _, v_files in os.walk(v_folder_path):
28
  for v_file in v_files:
29
  v_file_path = os.path.join(v_root, v_file)
30
-
 
31
  if v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
32
  v_text = extract_text_from_file(v_file_path)
33
  if not v_text.strip():
34
- continue # Skip empty content
35
 
36
- # Encode text into embeddings
37
- # convert_to_tensor=True yields a PyTorch tensor, so convert to numpy
38
  v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
39
-
40
- # Add to FAISS index
41
  v_index.add(v_embeddings)
42
 
43
- # Map index ID to filename
44
- v_metadata[v_doc_counter] = v_file_path
 
 
 
45
  v_doc_counter += 1
46
 
47
- # Save FAISS index
48
  v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
49
  faiss.write_index(v_index, v_index_path)
50
 
51
- # Save metadata
52
  import json
53
  with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
54
  json.dump(v_metadata, obj_meta, indent=4)
 
5
  from sentence_transformers import SentenceTransformer
6
  from .file_handler import extract_text_from_file
7
 
 
8
  v_device = "cuda" if torch.cuda.is_available() else "cpu"
9
  obj_embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=v_device)
10
 
11
  def process_files_to_vectors(v_folder_path):
12
  """
13
+ Processes files (PDF, PPTX, CSV) found in v_folder_path to build a FAISS vector DB.
14
+ Stores only the reference path in metadata.json (no file name or full path).
15
  """
16
  v_vector_folder = os.path.join(v_folder_path, 'vectors')
17
  os.makedirs(v_vector_folder, exist_ok=True)
18
 
19
+ # Create a FAISS index (384 dimensions for all-MiniLM-L6-v2)
 
20
  v_index = faiss.IndexFlatL2(384)
21
  v_metadata = {}
 
 
22
  v_doc_counter = 0
23
+
24
  for v_root, _, v_files in os.walk(v_folder_path):
25
  for v_file in v_files:
26
  v_file_path = os.path.join(v_root, v_file)
27
+
28
+ # Filter files by extension
29
  if v_file_path.lower().endswith(('.pdf', '.pptx', '.csv')):
30
  v_text = extract_text_from_file(v_file_path)
31
  if not v_text.strip():
32
+ continue # skip empty files
33
 
34
+ # Convert text to embeddings
 
35
  v_embeddings = obj_embedding_model.encode([v_text], convert_to_tensor=True).cpu().numpy()
 
 
36
  v_index.add(v_embeddings)
37
 
38
+ # Generate a relative path (reference path only)
39
+ v_reference_path = os.path.relpath(v_file_path, start=v_folder_path)
40
+
41
+ # Store only the reference path in metadata
42
+ v_metadata[v_doc_counter] = v_reference_path
43
  v_doc_counter += 1
44
 
45
+ # Save the FAISS index
46
  v_index_path = os.path.join(v_vector_folder, 'vector_index.faiss')
47
  faiss.write_index(v_index, v_index_path)
48
 
49
+ # Save metadata (containing only reference paths)
50
  import json
51
  with open(os.path.join(v_vector_folder, 'metadata.json'), 'w', encoding='utf-8') as obj_meta:
52
  json.dump(v_metadata, obj_meta, indent=4)
app/utils/zip_handler.py CHANGED
@@ -3,9 +3,10 @@ import zipfile
3
 
4
  def handle_zip_upload(v_zip_path):
5
  """
6
- Extracts ZIP file contents into a subfolder of app/uploads.
 
7
  """
8
- v_extracted_path = os.path.splitext(v_zip_path)[0]
9
  with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
10
- obj_zip.extractall(v_extracted_path)
11
- return v_extracted_path
 
3
 
def handle_zip_upload(v_zip_path):
    """
    Extract the ZIP archive at v_zip_path into the folder that contains it.

    Example: 'app/uploads/myfile.zip' is extracted into 'app/uploads'.

    Args:
        v_zip_path: Path to an existing ZIP file.

    Returns:
        The folder the archive was extracted into. For a bare file name
        without a directory part this is '.', never an empty string.

    Raises:
        FileNotFoundError: if v_zip_path does not exist.
        zipfile.BadZipFile: if the file is not a valid ZIP archive.
    """
    # os.path.dirname('myfile.zip') is ''. An empty string is not a usable
    # directory path for callers (os.walk('') visits nothing), so fall back
    # to the current directory.
    v_parent_folder = os.path.dirname(v_zip_path) or os.curdir
    with zipfile.ZipFile(v_zip_path, 'r') as obj_zip:
        obj_zip.extractall(v_parent_folder)
    return v_parent_folder
run.py CHANGED
@@ -2,5 +2,6 @@ from app import create_app
2
 
3
  if __name__ == "__main__":
4
  obj_app = create_app()
5
- # Run the Flask app on 0.0.0.0:7860 for Hugging Face Spaces or local Docker
6
- obj_app.run(host='0.0.0.0', port=7860)
 
 
2
 
3
  if __name__ == "__main__":
4
  obj_app = create_app()
5
+ # Host 0.0.0.0 so it's accessible in Docker or on local network
6
+ # Port 7860 is an example; you can choose another
7
+ obj_app.run(host='0.0.0.0', port=7860, debug=False)