Spaces:

AmithAdiraju1694
/

translatemyimage-beta

Paused

App Files Files Community

Amith Adiraju committed on Sep 30, 2024

Commit

9a0f501

1 Parent(s): ca2b51e

Copied the entire codebase from a working Streamlit application in my personal Git repository. This application lets users upload images of menus (only Indian menus are supported for now); using OpenCV and LLMs, it explains each item on the menu in a specific format.

Browse files

Files changed (8) hide show

requirements.txt +12 -0
src/__init__.py +0 -0
src/inference/__init__.py +0 -0
src/inference/config.py +34 -0
src/inference/preprocess_image.py +41 -0
src/inference/translate.py +105 -0
src/main.py +71 -0
src/requirements.txt +11 -0

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+streamlit==1.37.1
+pandas==2.2.2
+altair
+easyocr==1.6.2
+matplotlib==3.7.1
+numpy==1.24.2
+lorem==0.1.1
+Pillow==9.5.0
+nltk==3.9.1
+torch==2.1.0
+transformers==4.44.2
+sentencepiece

src/__init__.py ADDED Viewed

File without changes

src/inference/__init__.py ADDED Viewed

File without changes

src/inference/config.py ADDED Viewed

	@@ -0,0 +1,34 @@

# Few-shot prompt for the item summarizer: three worked "Item -> Explanation"
# examples followed by an open "Item ->" slot that the caller completes with
# the menu item to explain. The section names listed in the final instruction
# match the headings used in the examples ("How it is made"), so the model is
# shown one consistent output format.
INSTRUCTION_PROMPT = """
The following text contains examples of three items and their corresponding explanations in the required format.\n
Item -> palak paneer.\n
Explanation -> Major Ingredients here: paneer ( a.k.a cottage cheese ) , palak ( spinach ).\n
How it is made: It's a savory item, made like a gravy; usually made by sauteing spices and mixing saute with boiled paneer and palak.\n
It goes well with: White basmati rice or Indian flat bread.\n
Allergens: Paneer may cause digestive discomfort and intolerance to some.\n
Food Category: Vegetarian, Vegans may not like it, as paneer is usually made from cow milk.
Item -> rumali roti.\n
Explanation -> Major Ingredients here: roti.\n
How it is made: A small soft bread, made to size of a napkin ( a.k.a 'rumal' in hindi ); usually made with a combination of whole wheat and all purpose flour.\n
It goes well with: Most indian gravies such as palak paneer, tomato curry etc.\n
Allergens: May contain gluten, which is known to cause digestive discomfort and intolerance to some.\n
Food Category: Vegetarian, Vegan.
Item -> nizami handi.\n
Explanation -> Major Ingredients here: Different veggies, makhani sauce (skimmed milk, tomato and cashew paste , indian spices), combination of nuts.\n
How it is made: Makhani sauce is added to onion-tomato based paste and brought to a boil; a Medley of veggies and gently flavored whole spices are added and boiled for small time.\n
It goes well with: Different kinds of indian flat breads, white basmati and sonamasoori rice.\n
Allergens: Presence of nuts, butter cream and makhani sauce are known to cause digestive discomfort and intolerance to some.\n
Food Category: Usually vegetarian, may include chicken or animal meat sometimes, please check with hotel.
Based on Item and explanation pairs provided above, provide similar explanation ('Major Ingredients', 'How it is made', 'It goes well with', 'Allergens' and 'Food Category') to the below item.\n
Item ->
"""
# When True, the app prints timing information after processing an image.
DEBUG_MODE = True
# Device selector; GPU use for OCR is disabled when this is 'cpu'.
DEVICE = 'cpu'

src/inference/preprocess_image.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import numpy as np
+from typing import List, Tuple, Optional, AnyStr
+import nltk
+nltk.download("stopwords")
+nltk.download('punkt')
+from nltk.tokenize import RegexpTokenizer
+from nltk.corpus import stopwords
+import re
def preprocess_text(sentence: AnyStr) -> AnyStr:
    """Normalise one extracted string into space-separated keywords.

    The text is lower-cased, the literal marker ``{html}`` is removed, then
    anything between ``<`` and ``>``, anything starting with ``http`` up to
    the next whitespace, and all digit runs are stripped. The remainder is
    split into ``\\w+`` tokens; tokens of length two or less and English stop
    words are dropped.

    Args:
        sentence: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces (``""`` when none
        survive).
    """
    text = sentence.lower().replace('{html}', "")
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[0-9]+', '', text)
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Fetch the stop-word list once per call and keep it in a set: the
    # previous code called stopwords.words('english') for every token and
    # searched the returned list linearly.
    english_stopwords = frozenset(stopwords.words('english'))
    kept_words = [
        token for token in tokens
        if len(token) > 2 and token not in english_stopwords
    ]
    return " ".join(kept_words)
def image_to_np_arr(image) -> np.ndarray:
    """Return *image* converted to a NumPy array via ``np.array``.

    The return annotation is ``np.ndarray`` (the array type); ``np.array`` is
    the constructor function and not a type.

    Args:
        image: Any object ``np.array`` accepts.

    Returns:
        A new ``np.ndarray`` built from *image*.
    """
    return np.array(image)
def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
    """Clean every detected text fragment and keep only the multi-word ones.

    Each entry of *raw_extrc_text* is unpacked as a 3-tuple whose middle
    element is the detected text. That text is cleaned with
    ``preprocess_text`` and kept only when the cleaned string splits on
    single spaces into more than two pieces.

    Returns:
        The cleaned strings that passed the length filter, in input order.
    """
    cleaned_texts = (
        preprocess_text(entry_text) for _, entry_text, _ in raw_extrc_text
    )
    return [
        cleaned for cleaned in cleaned_texts
        if len(cleaned.split(" ")) > 2
    ]

src/inference/translate.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import streamlit as st
+from inference.preprocess_image import (
+    image_to_np_arr,
+    process_extracted_text
+)
+from inference.config import INSTRUCTION_PROMPT, DEVICE
+from typing import List, Tuple, Optional, AnyStr, Dict
+from transformers import T5Tokenizer, T5ForConditionalGeneration
+import easyocr
+import time
# The GPU is used unless the configured device is the CPU.
use_gpu = DEVICE != 'cpu'
def extract_filter_img(image, text_extractor) -> List[AnyStr]:
    """Run the image-to-menu-text pipeline while reporting progress in the UI.

    The steps run in order, each receiving the previous step's result:
      1. Convert the image to a numpy array (``image_to_np_arr``).
      2. Detect and extract text from the image (``text_extractor.readtext``).
      3. Clean the raw extracted text (``process_extracted_text``).
      4. Keep only menu-related strings (``classify_menu_text``).

    A Streamlit progress bar and a status placeholder are updated around each
    step and are always cleared before returning, including when a step
    raises.

    Args:
        image: The image to process; handed to ``image_to_np_arr``.
        text_extractor: Object exposing a ``readtext`` method that accepts the
            numpy image.

    Returns:
        The output of the final step (the filtered menu strings).
    """
    pipeline_steps = [
        (image_to_np_arr, 'Converting Image to required format', 'Done Converting !'),
        (text_extractor.readtext, 'Extracting text from inp image', 'Done Extracting !'),
        (process_extracted_text, 'Clean Raw Extracted text', 'Done Cleaning !'),
        (classify_menu_text, 'Removing non-menu related text', 'Done removing !'),
    ]
    total_steps = len(pipeline_steps)
    # Steps that get a half-second pause before and after running, presumably
    # so their status text stays visible. The extraction step (index 1) gets
    # no pause. The old list also named index 4, which never occurs with four
    # steps.
    delayed_steps = frozenset({0, 2, 3})

    progress_bar = st.progress(0)
    status_message = st.empty()
    result = image
    try:
        for step_index, (func, start_message, end_message) in enumerate(pipeline_steps):
            status_message.write(start_message)
            if step_index in delayed_steps:
                time.sleep(0.5)

            result = func(result)

            status_message.write(end_message)
            progress_bar.progress((step_index + 1) / total_steps)
            if step_index in delayed_steps:
                time.sleep(0.5)
    finally:
        # Clear the temporary widgets even if a step failed, so a stale
        # progress bar is not left on the page.
        progress_bar.empty()
        status_message.empty()
    return result
def transcribe_menu_model(menu_texts: List[AnyStr],
                          text_summarizer = None,
                          text_tokenizer = None) -> Dict:
    """Generate an explanation for every menu text with the summarizer model.

    For each entry the prompt ``INSTRUCTION_PROMPT + " " + entry + "\\n"`` is
    tokenized, passed to ``text_summarizer.generate`` with
    ``max_new_tokens=512``, and the first generated sequence is decoded with
    special tokens skipped.

    NOTE(review): if ``INSTRUCTION_PROMPT`` ends with a newline, the item is
    placed on the line after the trailing "Item ->" — confirm this is the
    intended prompt layout.

    Args:
        menu_texts: Menu strings to explain.
        text_summarizer: Model exposing ``generate``.
        text_tokenizer: Callable tokenizer that returns an object with
            ``input_ids`` and exposes ``decode``.

    Returns:
        Dict mapping each menu text to its decoded explanation. An empty
        input yields an empty dict without touching the model.

    Raises:
        NotImplementedError: There are items to process but no
            ``text_summarizer`` was supplied.
        TypeError: There are items to process but no ``text_tokenizer`` was
            supplied.
    """
    menu_texts = list(menu_texts)
    if not menu_texts:
        return {}

    # Validate once up front instead of on every loop iteration, and give
    # callers a message they can act on.
    if text_summarizer is None:
        raise NotImplementedError(
            "transcribe_menu_model needs a text_summarizer model; "
            "no fallback summarizer is implemented."
        )
    if text_tokenizer is None:
        raise TypeError(
            "transcribe_menu_model needs a text_tokenizer when a "
            "text_summarizer is supplied."
        )

    summarized_menu_items = {}
    for menu_item in menu_texts:
        prompt_item = INSTRUCTION_PROMPT + " " + menu_item + "\n"
        input_ids = text_tokenizer(prompt_item, return_tensors="pt").input_ids
        outputs = text_summarizer.generate(input_ids, max_new_tokens=512)
        summarized_menu_items[menu_item] = text_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True,
        )
    return summarized_menu_items
@st.cache_resource
def load_models(item_summarizer: AnyStr) -> Tuple:
    """Load the OCR reader and the T5 tokenizer/model, cached across reruns.

    Streamlit re-executes the app script on every interaction. Caching with
    ``st.cache_resource`` means the models are built once per
    ``item_summarizer`` value instead of on every rerun.

    Args:
        item_summarizer: Model name or path given to ``from_pretrained`` for
            both the tokenizer and the model.

    Returns:
        ``(text_extractor, tokenizer, model)``: an ``easyocr.Reader`` for
        English, a ``T5Tokenizer`` and a ``T5ForConditionalGeneration``.
    """
    text_extractor = easyocr.Reader(['en'], gpu=use_gpu)
    tokenizer = T5Tokenizer.from_pretrained(item_summarizer)
    model = T5ForConditionalGeneration.from_pretrained(item_summarizer)
    return (text_extractor, tokenizer, model)
def classify_menu_text(extrc_str: List[AnyStr]) -> List[AnyStr]:
    """Return *extrc_str* unchanged.

    Currently a pass-through: no classification or filtering is performed
    and the very same list object is handed back.
    """
    menu_strings = extrc_str
    return menu_strings

src/main.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import streamlit as st
+from inference.translate import (
+    extract_filter_img,
+    transcribe_menu_model,
+    load_models
+)
+from inference.config import DEBUG_MODE
+from PIL import Image
+import time
# Streamlit app: upload a menu image, extract its text and explain each item.
st.title("Image Upload and Processing")

# Open-source text detector plus an LLM (tokenizer + model) for explaining items.
text_extractor, item_tokenizer, item_summarizer = load_models(
    item_summarizer="google/flan-t5-large"
)

# Upload widget; accepts common image formats.
uploaded_file = st.file_uploader("Choose an image...",
                                 type=["jpg", "jpeg", "png"])

# Everything below needs an uploaded image.
if uploaded_file is not None:
    image = Image.open(uploaded_file)

    # Only show the image if the user asks for it.
    if st.checkbox('Show Uploaded Image'):
        st.image(image,
                 caption='Uploaded Image',
                 use_column_width=True)

    # Submit button
    if st.button("Submit"):
        msg1 = st.empty()
        msg1.write("Pre-processing and extracting text out of your image ....")

        st_filter = time.perf_counter()
        filtered_text = extract_filter_img(image, text_extractor)
        en_filter = time.perf_counter()

        if not filtered_text:
            # Nothing usable was extracted: say so instead of reporting
            # success and rendering an empty table.
            msg1.empty()
            st.warning("No menu text could be extracted from the uploaded image. "
                       "Please try a clearer image.")
        else:
            msg2 = st.empty()
            msg2.write("All pre-processing done, transcribing your menu items now ....")

            st_trans_llm = time.perf_counter()
            translated_text_dict = transcribe_menu_model(
                menu_texts=filtered_text,
                text_tokenizer=item_tokenizer,
                text_summarizer=item_summarizer,
            )
            # Stop the timer before any UI write so it measures only the LLM call.
            en_trans_llm = time.perf_counter()

            msg3 = st.empty()
            msg3.write("Done transcribing ... ")

            msg1.empty()
            msg2.empty()
            msg3.empty()
            st.success("Image processed successfully! ")

            if DEBUG_MODE:
                filter_time_sec = en_filter - st_filter
                llm_time_sec = en_trans_llm - st_trans_llm
                total_time_sec = filter_time_sec + llm_time_sec

                st.write("Time took to extract and filter text {}".format(filter_time_sec))
                st.write("Time took to summarize by LLM {}".format(llm_time_sec))
                st.write('Overall time taken in seconds: {}'.format(total_time_sec))

            st.table(translated_text_dict)

src/requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+sentencepiece==0.2.0
+transformers==4.44.2
+streamlit==1.37.1
+pandas==2.2.2
+altair
+easyocr==1.6.2
+matplotlib==3.7.1
+numpy==1.24.2
+Pillow==9.5.0
+nltk==3.9.1
+torch==2.1.0