File size: 4,725 Bytes
97208ad
 
 
 
 
 
 
 
 
 
 
8e73170
97208ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e73170
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import io
import pickle
import re

import pandas as pd
import xlsxwriter
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS

from modules.deed_preprocessing.preprocessor import preprocess_text
from modules.deed_preprocessing.spellcheck import correct_spelling
from modules.google_cloud_ocr.google_cloud_ocr import google_cloud_ocr
from modules.model_experimentation.bag_of_words_logistic_regression import predict
from modules.openai.racist_chatgpt_analysis import racist_chatgpt_analysis

app = Flask(__name__)
# CORS(app, resources={r"/*": {"origins": "*"}})
# NOTE(review): supports_credentials=True combined with origins="*" is a
# risky CORS combination — confirm the frontend actually needs credentials.
CORS(app, supports_credentials=True, origins="*")

# Load the fitted bag-of-words vectorizer and logistic-regression model once
# at startup so every request can reuse them without re-reading from disk.
# NOTE(review): pickle.load is acceptable here only because these are
# project-owned artifacts; never unpickle untrusted data.
with open('modules/model_experimentation/vectorizer.pkl', 'rb') as vec_file:
    vectorizer = pickle.load(vec_file)

with open('modules/model_experimentation/logistic_model.pkl', 'rb') as model_file:
    logistic_model = pickle.load(model_file)

# Helper to look for the book and page numbers
def extract_book_and_page(text):
    """Extract deed book and page numbers referenced in *text*.

    Matches case-insensitive patterns like "Book 123", "book no. 4",
    "Page: 56", or "page #7". This generalizes the original
    whitespace-only separator ("book 123") to also accept an optional
    "no."/"#" marker and a colon, while the added word boundary stops
    words like "notebook" from matching.

    Args:
        text: Raw or spellchecked OCR text to scan.

    Returns:
        A tuple (book_numbers, page_numbers) of lists of digit strings in
        order of appearance; both lists are empty when nothing matches.
    """
    book_numbers = re.findall(r"\bbook\s*(?:no\.?|#)?\s*:?\s*(\d+)", text, re.IGNORECASE)
    page_numbers = re.findall(r"\bpage\s*(?:no\.?|#)?\s*:?\s*(\d+)", text, re.IGNORECASE)
    return book_numbers, page_numbers

@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Handle a deed-image upload: OCR, spellcheck, preprocess, analyze.

    Form fields:
        file: the uploaded image (required).
        ocr_engine: 'google' (implemented) or 'azure' (stub); default 'google'.
        analysis_method: 'chatgpt' or 'logistic_regression'; default 'chatgpt'.

    Returns:
        On success, JSON containing the OCR/spellcheck/preprocess artifacts,
        the extracted names/locations/book/page info, and the analysis
        result. On failure, a JSON error with a 400 or 500 status code.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    ocr_engine = request.form.get('ocr_engine', 'google')
    analysis_method = request.form.get('analysis_method', 'chatgpt')

    try:
        if ocr_engine == 'azure':
            # Azure OCR is not implemented yet; return a placeholder payload.
            return jsonify({'status': 'success', 'ocr_engine': 'azure', 'text': "fill"}), 200
        if ocr_engine != 'google':
            return jsonify({'error': 'Unsupported OCR engine selected'}), 400

        # Step 1: Get text using Google OCR.
        google_text = google_cloud_ocr(file)

        # Step 2: Pass text through the spell checker.
        spellchecked_text = correct_spelling(google_text)

        # Step 3: Pass text through the preprocessor.
        processed_text = preprocess_text(spellchecked_text)

        # Book/page references are pulled from the spellchecked (not yet
        # preprocessed) text, before preprocessing can alter the digits.
        book_numbers, page_numbers = extract_book_and_page(spellchecked_text)

        # Step 4: Get the names and locations.
        extracted_info = {
            "names": processed_text.get("names", []),
            "locations": processed_text.get("locations", []),
            "book_numbers": book_numbers,
            "page_numbers": page_numbers,
        }

        # Step 5: Run the chosen analysis. Both methods share an identical
        # response shape, so compute the result first and build it once.
        if analysis_method == 'chatgpt':
            result = racist_chatgpt_analysis(processed_text['original_text'])
        elif analysis_method == 'logistic_regression':
            result = predict(processed_text, vectorizer, logistic_model)['is_racist']
        else:
            return jsonify({'error': 'Unsupported analysis method selected'}), 400

        return jsonify({
            'status': 'success',
            'ocr_engine': 'google',
            'analysis_method': analysis_method,
            'original_text': google_text,
            'spellchecked_text': spellchecked_text,
            'processed_text': processed_text,
            'extracted_info': extracted_info,
            'result': result,
        }), 200
    except Exception as e:
        # Boundary catch-all: surface the failure as JSON instead of an HTML
        # traceback. NOTE(review): str(e) may leak internal details to clients.
        return jsonify({'error': str(e)}), 500

@app.route('/api/download_excel', methods=['POST'])
def download_excel():
    """Build an Excel workbook from the posted JSON and stream it back.

    Expects a JSON body convertible to a pandas DataFrame. The workbook is
    written to an in-memory buffer rather than a shared 'output.xlsx' on
    disk, which avoids leftover files and races between concurrent requests.

    Returns:
        The .xlsx file as an attachment named 'analysis_results.xlsx', or a
        JSON error with a 400 (no data) or 500 (unexpected failure) status.
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({'error': 'No data provided'}), 400

        df = pd.DataFrame(data)
        buffer = io.BytesIO()
        with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False, sheet_name='Sheet1')
        buffer.seek(0)  # rewind so send_file streams from the start

        return send_file(
            buffer,
            as_attachment=True,
            download_name='analysis_results.xlsx',
            mimetype='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        )
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader and is
    # unsafe in production; host="0.0.0.0" listens on all interfaces.
    app.run(debug=True, host="0.0.0.0", port=7860)