Update app.py
Updated App.py
app.py
CHANGED
@@ -40,6 +40,22 @@ def init_qwen_model():
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model.eval(), processor
 
+# Text Cleaning AI - Clean spaces, handle dual languages
+def clean_extracted_text(text):
+    # Remove extra spaces
+    cleaned_text = re.sub(r'\s+', ' ', text).strip()
+    cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
+    return cleaned_text
+
+# Polish the text using a model
+def polish_text_with_ai(cleaned_text, model, tokenizer):
+    # Use a text generation pipeline for better language flow
+    from transformers import pipeline
+    nlp = pipeline('text-generation', model=model, tokenizer=tokenizer)
+    prompt = f"Correct and clean the following text: '{cleaned_text}' and make it meaningful."
+    polished_text = nlp(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
+    return polished_text
+
 # Extract text using GOT
 def extract_text_got(image_file, model, tokenizer):
     return model.chat(tokenizer, image_file, ocr_type='ocr')
@@ -57,13 +73,6 @@ def extract_text_qwen(image_file, model, processor):
     except Exception as e:
         return f"An error occurred: {str(e)}"
 
-# Text Cleaning AI - Clean spaces, handle dual languages
-def clean_extracted_text(text):
-    # Remove extra spaces
-    cleaned_text = re.sub(r'\s+', ' ', text).strip()
-    cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
-    return cleaned_text
-
 # Highlight keyword search
 def highlight_text(text, search_term):
     if not search_term:
@@ -126,19 +135,25 @@ if predict_button and uploaded_file:
     # Clean extracted text
     cleaned_text = clean_extracted_text(extracted_text)
 
+    # Optionally, polish text with AI model for better language flow
+    if model_choice in ["GOT_CPU", "GOT_GPU"]:
+        polished_text = polish_text_with_ai(cleaned_text, got_model, tokenizer)
+    else:
+        polished_text = cleaned_text
+
     # Delete temp file
     if os.path.exists(temp_file_path):
        os.remove(temp_file_path)
 
     # Display extracted text and search functionality
-    st.subheader("Extracted Text (Cleaned)")
-    st.markdown(cleaned_text, unsafe_allow_html=True)
+    st.subheader("Extracted Text (Cleaned & Polished)")
+    st.markdown(polished_text, unsafe_allow_html=True)
 
     search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
     if search_query:
-        highlighted_text = highlight_text(cleaned_text, search_query)
+        highlighted_text = highlight_text(polished_text, search_query)
         st.markdown("### Highlighted Search Results:")
         st.markdown(highlighted_text, unsafe_allow_html=True)
     else:
         st.markdown("### Extracted Text:")
-        st.markdown(cleaned_text, unsafe_allow_html=True)
+        st.markdown(polished_text, unsafe_allow_html=True)
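For reference, a minimal standalone sketch of the two helpers this commit adds, exercised outside Streamlit. The 'gpt2' checkpoint name, the sample string, and the expected output are stand-ins for illustration only; in the app itself the selected GOT model and tokenizer are passed into polish_text_with_ai, and whether that pairing runs inside a generic text-generation pipeline is not verified here.

# Standalone sketch of the new cleaning/polishing steps; assumptions are noted inline.
import re
from transformers import pipeline

def clean_extracted_text(text):
    # Collapse runs of whitespace, then drop the space left before punctuation.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
    return cleaned_text

def polish_text_with_ai(cleaned_text, model, tokenizer):
    # Same prompt shape as in the diff; model/tokenizer are whatever the caller supplies.
    nlp = pipeline('text-generation', model=model, tokenizer=tokenizer)
    prompt = f"Correct and clean the following text: '{cleaned_text}' and make it meaningful."
    return nlp(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']

if __name__ == "__main__":
    raw = "Hello   ,  this is   OCR   output ."   # hypothetical OCR output
    cleaned = clean_extracted_text(raw)
    print(cleaned)                                 # -> "Hello, this is OCR output."

    # 'gpt2' is a stand-in checkpoint; the app passes the GOT model and tokenizer here instead.
    print(polish_text_with_ai(cleaned, 'gpt2', 'gpt2'))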