Update app.py
Updated App.py
app.py
CHANGED
@@ -40,6 +40,22 @@ def init_qwen_model():
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model.eval(), processor
 
+# Text Cleaning AI - Clean spaces, handle dual languages
+def clean_extracted_text(text):
+    # Remove extra spaces
+    cleaned_text = re.sub(r'\s+', ' ', text).strip()
+    cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
+    return cleaned_text
+
+# Polish the text using a model
+def polish_text_with_ai(cleaned_text, model, tokenizer):
+    # Use a text generation pipeline for better language flow
+    from transformers import pipeline
+    nlp = pipeline('text-generation', model=model, tokenizer=tokenizer)
+    prompt = f"Correct and clean the following text: '{cleaned_text}' and make it meaningful."
+    polished_text = nlp(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
+    return polished_text
+
 # Extract text using GOT
 def extract_text_got(image_file, model, tokenizer):
     return model.chat(tokenizer, image_file, ocr_type='ocr')
@@ -57,13 +73,6 @@ def extract_text_qwen(image_file, model, processor):
     except Exception as e:
         return f"An error occurred: {str(e)}"
 
-# Text Cleaning AI - Clean spaces, handle dual languages
-def clean_extracted_text(text):
-    # Remove extra spaces
-    cleaned_text = re.sub(r'\s+', ' ', text).strip()
-    cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
-    return cleaned_text
-
 # Highlight keyword search
 def highlight_text(text, search_term):
     if not search_term:
@@ -126,19 +135,25 @@ if predict_button and uploaded_file:
     # Clean extracted text
     cleaned_text = clean_extracted_text(extracted_text)
 
+    # Optionally, polish text with AI model for better language flow
+    if model_choice in ["GOT_CPU", "GOT_GPU"]:
+        polished_text = polish_text_with_ai(cleaned_text, got_model, tokenizer)
+    else:
+        polished_text = cleaned_text
+
     # Delete temp file
     if os.path.exists(temp_file_path):
        os.remove(temp_file_path)
 
     # Display extracted text and search functionality
-    st.subheader("Extracted Text (Cleaned)")
-    st.markdown(cleaned_text, unsafe_allow_html=True)
+    st.subheader("Extracted Text (Cleaned & Polished)")
+    st.markdown(polished_text, unsafe_allow_html=True)
 
     search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
     if search_query:
-        highlighted_text = highlight_text(cleaned_text, search_query)
+        highlighted_text = highlight_text(polished_text, search_query)
         st.markdown("### Highlighted Search Results:")
         st.markdown(highlighted_text, unsafe_allow_html=True)
     else:
         st.markdown("### Extracted Text:")
-        st.markdown(cleaned_text, unsafe_allow_html=True)
+        st.markdown(polished_text, unsafe_allow_html=True)
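For reference, a minimal standalone sketch of the two helpers this commit adds, exercised outside Streamlit. The 'gpt2' checkpoint name, the sample string, and the expected output are stand-ins for illustration only; in the app itself the selected GOT model and tokenizer are passed into polish_text_with_ai, and whether that pairing runs inside a generic text-generation pipeline is not verified here.

# Standalone sketch of the new cleaning/polishing steps; assumptions are noted inline.
import re
from transformers import pipeline

def clean_extracted_text(text):
    # Collapse runs of whitespace, then drop the space left before punctuation.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
    return cleaned_text

def polish_text_with_ai(cleaned_text, model, tokenizer):
    # Same prompt shape as in the diff; model/tokenizer are whatever the caller supplies.
    nlp = pipeline('text-generation', model=model, tokenizer=tokenizer)
    prompt = f"Correct and clean the following text: '{cleaned_text}' and make it meaningful."
    return nlp(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']

if __name__ == "__main__":
    raw = "Hello   ,  this is   OCR   output ."   # hypothetical OCR output
    cleaned = clean_extracted_text(raw)
    print(cleaned)                                 # -> "Hello, this is OCR output."

    # 'gpt2' is a stand-in checkpoint; the app passes the GOT model and tokenizer here instead.
    print(polish_text_with_ai(cleaned, 'gpt2', 'gpt2'))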