UniquePratham committed on
Commit
0efdb28
·
verified ·
1 Parent(s): 0c84dc0

Update app.py

Browse files

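Updated app.py: moved clean_extracted_text above the extraction helpers and added polish_text_with_ai, which post-processes the cleaned text when a GOT model is selected; the displayed and searchable text is now the polished text.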

Files changed (1) hide show
  1. app.py +26 -11
app.py CHANGED
@@ -40,6 +40,22 @@ def init_qwen_model():
40
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
41
  return model.eval(), processor
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  # Extract text using GOT
44
  def extract_text_got(image_file, model, tokenizer):
45
  return model.chat(tokenizer, image_file, ocr_type='ocr')
@@ -57,13 +73,6 @@ def extract_text_qwen(image_file, model, processor):
57
  except Exception as e:
58
  return f"An error occurred: {str(e)}"
59
 
60
- # Text Cleaning AI - Clean spaces, handle dual languages
61
- def clean_extracted_text(text):
62
- # Remove extra spaces
63
- cleaned_text = re.sub(r'\s+', ' ', text).strip()
64
- cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
65
- return cleaned_text
66
-
67
  # Highlight keyword search
68
  def highlight_text(text, search_term):
69
  if not search_term:
@@ -126,19 +135,25 @@ if predict_button and uploaded_file:
126
  # Clean extracted text
127
  cleaned_text = clean_extracted_text(extracted_text)
128
 
 
 
 
 
 
 
129
  # Delete temp file
130
  if os.path.exists(temp_file_path):
131
  os.remove(temp_file_path)
132
 
133
  # Display extracted text and search functionality
134
- st.subheader("Extracted Text (Cleaned)")
135
- st.markdown(cleaned_text, unsafe_allow_html=True)
136
 
137
  search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
138
  if search_query:
139
- highlighted_text = highlight_text(cleaned_text, search_query)
140
  st.markdown("### Highlighted Search Results:")
141
  st.markdown(highlighted_text, unsafe_allow_html=True)
142
  else:
143
  st.markdown("### Extracted Text:")
144
- st.markdown(cleaned_text, unsafe_allow_html=True)
 
40
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
41
  return model.eval(), processor
42
 
43
+ # Text Cleaning AI - Clean spaces, handle dual languages
44
+ def clean_extracted_text(text):
45
+ # Remove extra spaces
46
+ cleaned_text = re.sub(r'\s+', ' ', text).strip()
47
+ cleaned_text = re.sub(r'\s([?.!,])', r'\1', cleaned_text)
48
+ return cleaned_text
49
+
50
+ # Polish the text using a model
51
+ def polish_text_with_ai(cleaned_text, model, tokenizer):
52
+ # Use a text generation pipeline for better language flow
53
+ from transformers import pipeline
54
+ nlp = pipeline('text-generation', model=model, tokenizer=tokenizer)
55
+ prompt = f"Correct and clean the following text: '{cleaned_text}' and make it meaningful."
56
+ polished_text = nlp(prompt, max_length=100, num_return_sequences=1)[0]['generated_text']
57
+ return polished_text
58
+
59
  # Extract text using GOT
60
  def extract_text_got(image_file, model, tokenizer):
61
  return model.chat(tokenizer, image_file, ocr_type='ocr')
 
73
  except Exception as e:
74
  return f"An error occurred: {str(e)}"
75
 
 
 
 
 
 
 
 
76
  # Highlight keyword search
77
  def highlight_text(text, search_term):
78
  if not search_term:
 
135
  # Clean extracted text
136
  cleaned_text = clean_extracted_text(extracted_text)
137
 
138
+ # Optionally, polish text with AI model for better language flow
139
+ if model_choice in ["GOT_CPU", "GOT_GPU"]:
140
+ polished_text = polish_text_with_ai(cleaned_text, got_model, tokenizer)
141
+ else:
142
+ polished_text = cleaned_text
143
+
144
  # Delete temp file
145
  if os.path.exists(temp_file_path):
146
  os.remove(temp_file_path)
147
 
148
  # Display extracted text and search functionality
149
+ st.subheader("Extracted Text (Cleaned & Polished)")
150
+ st.markdown(polished_text, unsafe_allow_html=True)
151
 
152
  search_query = st.text_input("Search in extracted text:", key="search_query", placeholder="Type to search...")
153
  if search_query:
154
+ highlighted_text = highlight_text(polished_text, search_query)
155
  st.markdown("### Highlighted Search Results:")
156
  st.markdown(highlighted_text, unsafe_allow_html=True)
157
  else:
158
  st.markdown("### Extracted Text:")
159
+ st.markdown(polished_text, unsafe_allow_html=True)