Spaces:

UniquePratham
/

DualTextOCRFusion

Sleeping

App Files Files Community

UniquePratham committed on Sep 29, 2024

Commit

93bd871

verified ·

1 Parent(s): 0fbad84

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -45

app.py CHANGED Viewed

@@ -1,9 +1,6 @@
 import streamlit as st
 from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor
-from surya.ocr import run_ocr
-from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
-from surya.model.recognition.model import load_model as load_rec_model
-from surya.model.recognition.processor import load_processor as load_rec_processor
 from PIL import Image
 import torch
 import os
@@ -15,47 +12,56 @@ from groq import Groq
 from st_keyup import st_keyup
 from st_img_pastebutton import paste
 # Page configuration
-st.set_page_config(page_title="DualTextOCRFusion", page_icon="🔍", layout="wide")
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load Surya OCR Models (English + Hindi)
-det_processor, det_model = load_det_processor(), load_det_model()
-det_model.to(device)
-rec_model, rec_processor = load_rec_model(), load_rec_processor()
-rec_model.to(device)
 # Load GOT Models
 @st.cache_resource
 def init_got_model():
-    tokenizer = AutoTokenizer.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True)
-    model = AutoModel.from_pretrained('srimanth-d/GOT_CPU', trust_remote_code=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
     return model.eval(), tokenizer
 @st.cache_resource
 def init_got_gpu_model():
-    tokenizer = AutoTokenizer.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True)
-    model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True, device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
     return model.eval().cuda(), tokenizer
 # Load Qwen Model
 @st.cache_resource
 def init_qwen_model():
-    model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", device_map="cpu", torch_dtype=torch.float16)
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model.eval(), processor
 # Text cleaning - collapse whitespace runs and remove the space before ? . ! ,
 def clean_extracted_text(text):
     """Normalize whitespace in extracted text.

     Every run of whitespace is collapsed to a single space, leading and
     trailing whitespace is trimmed, and a whitespace character that sits
     directly before ``?``, ``.``, ``!`` or ``,`` is removed.
     """
     single_spaced = re.sub(r'\s+', ' ', text)
     trimmed = single_spaced.strip()
     # Pull punctuation back against the preceding word.
     return re.sub(r'\s([?.!,])', r'\1', trimmed)
 # Polish the text using a model
 def polish_text_with_ai(cleaned_text):
     prompt = f"Remove unwanted spaces between and inside words to join incomplete words, creating a meaningful sentence in either Hindi, English, or Hinglish without altering any words from the given extracted text. Then, return the corrected text with adjusted spaces, keeping it as close to the original as possible, along with relevant details or insights that an AI can provide about the extracted text.  Extracted Text : {cleaned_text}"
-    client = Groq(api_key="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")
     chat_completion = client.chat.completions.create(
         messages=[
             {
@@ -80,16 +86,22 @@ def extract_text_got(image_file, model, tokenizer):
 def extract_text_qwen(image_file, model, processor):
     try:
         image = Image.open(image_file).convert('RGB')
-        conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Extract text from this image."}]}]
-        text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-        inputs = processor(text=[text_prompt], images=[image], return_tensors="pt")
         output_ids = model.generate(**inputs)
-        output_text = processor.batch_decode(output_ids, skip_special_tokens=True)
         return output_text[0] if output_text else "No text extracted from the image."
     except Exception as e:
         return f"An error occurred: {str(e)}"
 # Function to highlight the keyword in the text
 def highlight_text(text, search_term):
     if not search_term:  # If no search term is provided, return the original text
         return text
@@ -98,6 +110,7 @@ def highlight_text(text, search_term):
     # Highlight matched terms with yellow background
     return pattern.sub(lambda m: f'<span style="background-color: yellow;">{m.group()}</span>', text)
 # Title and UI
 st.title("DualTextOCRFusion - 🔍")
 st.header("OCR Application - Multimodel Support")
@@ -105,19 +118,22 @@ st.write("Upload an image for OCR using various models, with support for English
 # Sidebar Configuration
 st.sidebar.header("Configuration")
-model_choice = st.sidebar.selectbox("Select OCR Model:", ("GOT_CPU", "GOT_GPU", "Qwen", "Surya (English+Hindi)"))
 # Upload Section
-uploaded_file = st.sidebar.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])
 # Input from clipboard
 # Paste image button
-image_data = paste(label="paste from clipboard",key="image_clipboard")
 if image_data is not None:
-       header, encoded = image_data.split(",", 1)
-       decoded_bytes = base64.b64decode(encoded)
-       img_stream = io.BytesIO(decoded_bytes)
-       uploaded_file=img_stream
 # Input from camera
 camera_file = st.sidebar.camera_input("Capture from Camera")
@@ -134,19 +150,22 @@ col1, col2 = st.columns([2, 1])
 if uploaded_file:
     image = Image.open(uploaded_file)
     with col1:
-        col1.image(image, caption='Uploaded Image', use_column_width=False, width=300)
     # Save uploaded image to 'images' folder
     images_dir = 'images'
     os.makedirs(images_dir, exist_ok=True)
-    image_path = os.path.join(images_dir, uploaded_file.name)
     with open(image_path, 'wb') as f:
         f.write(uploaded_file.getvalue())
     # Check if the result already exists
     results_dir = 'results'
     os.makedirs(results_dir, exist_ok=True)
-    result_path = os.path.join(results_dir, f"{uploaded_file.name}_result.json")
     # Handle predictions
     if predict_button:
@@ -158,28 +177,27 @@ if uploaded_file:
             with st.spinner("Processing..."):
                 if model_choice == "GOT_CPU":
                     got_model, tokenizer = init_got_model()
-                    extracted_text = extract_text_got(image_path, got_model, tokenizer)
                 elif model_choice == "GOT_GPU":
                     got_gpu_model, tokenizer = init_got_gpu_model()
-                    extracted_text = extract_text_got(image_path, got_gpu_model, tokenizer)
                 elif model_choice == "Qwen":
                     qwen_model, qwen_processor = init_qwen_model()
-                    extracted_text = extract_text_qwen(image_path, qwen_model, qwen_processor)
-                elif model_choice == "Surya (English+Hindi)":
-                    langs = ["en", "hi"]
-                    predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
-                    text_list = re.findall(r"text='(.*?)'", str(predictions[0]))
-                    extracted_text = ' '.join(text_list)
         # Clean and polish extracted text
         cleaned_text = clean_extracted_text(extracted_text)
-        polished_text = polish_text_with_ai(cleaned_text) if model_choice in ["GOT_CPU", "GOT_GPU"] else cleaned_text
         # Save results to JSON file
-        result_data = {"extracted_text":extracted_text,"cleaner_text":cleaned_text,"polished_text": polished_text}
         with open(result_path, 'w') as f:
             json.dump(result_data, f)
@@ -197,9 +215,11 @@ if uploaded_file:
                 st.session_state["highlighted_result"] = extracted_text
         # Input search term with real-time update on key press
-        search_query = st_keyup("Search in extracted text:", key="search_key", on_change=update_search)
         # Display highlighted results if they exist in session state
         if "highlighted_result" in st.session_state:
             st.markdown("### Highlighted Search Results:")
-            st.markdown(st.session_state["highlighted_result"], unsafe_allow_html=True)

+import io
 import streamlit as st
 from transformers import AutoModel, AutoTokenizer, Qwen2VLForConditionalGeneration, AutoProcessor
 from PIL import Image
 import torch
 import os
 from st_keyup import st_keyup
 from st_img_pastebutton import paste
 # Page configuration
+st.set_page_config(page_title="DualTextOCRFusion",
+                   page_icon="🔍", layout="wide")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load GOT Models
 @st.cache_resource
 def init_got_model():
+    tokenizer = AutoTokenizer.from_pretrained(
+        'srimanth-d/GOT_CPU', trust_remote_code=True)
+    model = AutoModel.from_pretrained(
+        'srimanth-d/GOT_CPU', trust_remote_code=True, use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
     return model.eval(), tokenizer
 @st.cache_resource
 def init_got_gpu_model():
+    tokenizer = AutoTokenizer.from_pretrained(
+        'ucaslcl/GOT-OCR2_0', trust_remote_code=True)
+    model = AutoModel.from_pretrained('ucaslcl/GOT-OCR2_0', trust_remote_code=True, low_cpu_mem_usage=True,
+                                      device_map='cuda', use_safetensors=True, pad_token_id=tokenizer.eos_token_id)
     return model.eval().cuda(), tokenizer
 # Load Qwen Model
 @st.cache_resource
 def init_qwen_model():
+    model = Qwen2VLForConditionalGeneration.from_pretrained(
+        "Qwen/Qwen2-VL-2B-Instruct", device_map="cpu", torch_dtype=torch.float16)
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
     return model.eval(), processor
 # Text cleaning - collapse whitespace runs and remove the space before ? . ! ,
 def clean_extracted_text(text):
     """Normalize whitespace in extracted text.

     Every run of whitespace is collapsed to a single space, leading and
     trailing whitespace is trimmed, and a whitespace character that sits
     directly before ``?``, ``.``, ``!`` or ``,`` is removed.
     """
     single_spaced = re.sub(r'\s+', ' ', text)
     trimmed = single_spaced.strip()
     # Pull punctuation back against the preceding word.
     return re.sub(r'\s([?.!,])', r'\1', trimmed)
 # Polish the text using a model
 def polish_text_with_ai(cleaned_text):
     prompt = f"Remove unwanted spaces between and inside words to join incomplete words, creating a meaningful sentence in either Hindi, English, or Hinglish without altering any words from the given extracted text. Then, return the corrected text with adjusted spaces, keeping it as close to the original as possible, along with relevant details or insights that an AI can provide about the extracted text.  Extracted Text : {cleaned_text}"
+    client = Groq(
+        api_key="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")
     chat_completion = client.chat.completions.create(
         messages=[
             {
 def extract_text_qwen(image_file, model, processor):
     try:
         image = Image.open(image_file).convert('RGB')
+        conversation = [{"role": "user", "content": [{"type": "image"}, {
+            "type": "text", "text": "Extract text from this image."}]}]
+        text_prompt = processor.apply_chat_template(
+            conversation, add_generation_prompt=True)
+        inputs = processor(text=[text_prompt], images=[
+                           image], return_tensors="pt")
         output_ids = model.generate(**inputs)
+        output_text = processor.batch_decode(
+            output_ids, skip_special_tokens=True)
         return output_text[0] if output_text else "No text extracted from the image."
     except Exception as e:
         return f"An error occurred: {str(e)}"
 # Function to highlight the keyword in the text
 def highlight_text(text, search_term):
     if not search_term:  # If no search term is provided, return the original text
         return text
     # Highlight matched terms with yellow background
     return pattern.sub(lambda m: f'<span style="background-color: yellow;">{m.group()}</span>', text)
 # Title and UI
 st.title("DualTextOCRFusion - 🔍")
 st.header("OCR Application - Multimodel Support")
 # Sidebar Configuration
 st.sidebar.header("Configuration")
+model_choice = st.sidebar.selectbox(
+    "Select OCR Model:", ("GOT_CPU", "GOT_GPU", "Qwen"))
 # Upload Section
+uploaded_file = st.sidebar.file_uploader(
+    "Choose an image...", type=["png", "jpg", "jpeg"])
 # Input from clipboard
 # Paste image button
+image_data = paste(label="paste from clipboard", key="image_clipboard")
 if image_data is not None:
+    clipboard_use = True
+    header, encoded = image_data.split(",", 1)
+    decoded_bytes = base64.b64decode(encoded)
+    img_stream = io.BytesIO(decoded_bytes)
+    uploaded_file = img_stream
 # Input from camera
 camera_file = st.sidebar.camera_input("Capture from Camera")
 if uploaded_file:
     image = Image.open(uploaded_file)
     with col1:
+        col1.image(image, caption='Uploaded Image',
+                   use_column_width=False, width=300)
     # Save uploaded image to 'images' folder
     images_dir = 'images'
     os.makedirs(images_dir, exist_ok=True)
+    image_path = os.path.join(
+        images_dir, "temp_file.jpg" if clipboard_use else uploaded_file.name)
     with open(image_path, 'wb') as f:
         f.write(uploaded_file.getvalue())
     # Check if the result already exists
     results_dir = 'results'
     os.makedirs(results_dir, exist_ok=True)
+    result_path = os.path.join(
+        results_dir, "temp_file_result.json" if clipboard_use else f"{uploaded_file.name}_result.json")
     # Handle predictions
     if predict_button:
             with st.spinner("Processing..."):
                 if model_choice == "GOT_CPU":
                     got_model, tokenizer = init_got_model()
+                    extracted_text = extract_text_got(
+                        image_path, got_model, tokenizer)
                 elif model_choice == "GOT_GPU":
                     got_gpu_model, tokenizer = init_got_gpu_model()
+                    extracted_text = extract_text_got(
+                        image_path, got_gpu_model, tokenizer)
                 elif model_choice == "Qwen":
                     qwen_model, qwen_processor = init_qwen_model()
+                    extracted_text = extract_text_qwen(
+                        image_path, qwen_model, qwen_processor)
         # Clean and polish extracted text
         cleaned_text = clean_extracted_text(extracted_text)
+        polished_text = polish_text_with_ai(cleaned_text) if model_choice in [
+            "GOT_CPU", "GOT_GPU"] else cleaned_text
         # Save results to JSON file
+        result_data = {"extracted_text": extracted_text,
+                       "cleaner_text": cleaned_text, "polished_text": polished_text}
         with open(result_path, 'w') as f:
             json.dump(result_data, f)
                 st.session_state["highlighted_result"] = extracted_text
         # Input search term with real-time update on key press
+        search_query = st_keyup(
+            "Search in extracted text:", key="search_key", on_change=update_search)
         # Display highlighted results if they exist in session state
         if "highlighted_result" in st.session_state:
             st.markdown("### Highlighted Search Results:")
+            st.markdown(
+                st.session_state["highlighted_result"], unsafe_allow_html=True)