pr1 #2 by AmithAdiraju1694 - opened

Files changed:
- .gitignore +1 -2
- app.py +65 -51
- inference/config.py +27 -16
- inference/preprocess_image.py +5 -80
- inference/translate.py +39 -57
- pages.py +0 -214
- utils.py +0 -15
.gitignore
CHANGED
@@ -1,4 +1,3 @@
 misc.txt
 test_cas.py
-test_train_llm.py
-redir_app.py
+test_train_llm.py
app.py
CHANGED
@@ -1,64 +1,78 @@
-from utils import navigate_to
-from pages import manual_input_page, image_input_page, model_inference_page
-
 import streamlit as st
-from streamlit import session_state as sst
-import asyncio
 
-# function to remove all sesion variables from sst, except page.
-def reset_sst():
-    for key in list(sst.keys()):
-        if key != "page":
-            sst.pop(key, None)
 
-st.title("We will explain your menu like never before!")
-st.write("\n")
-st.write("\n")
-st.write("\n")
-
-c1, c2= st.columns(2)
 
-with c2:
-    # Navigate to image input page if user clicks on the button
-    st.button("Upload Items from Image", on_click=navigate_to, args=("ImageInput",))
 
-elif sst["page"] == "ImageInput":
-    reset_sst()  # reset all session state variables before navigating to the landing page
-    await image_input_page()  # Call the image input page function
 
+from inference.translate import (
+    extract_filter_img,
+    transcribe_menu_model,
+    load_models
+)
+
+from inference.config import DEBUG_MODE
+from PIL import Image
+import time
+
+# Streamlit app
+st.title("Image Upload and Processing")
+
+# Using open source text detector, LLM for explaining items
+text_extractor, \
+item_tokenizer, item_summarizer = load_models(item_summarizer="google/flan-t5-large")
+
+# Streamlit function to upload an image from any device
+uploaded_file = st.file_uploader("Choose an image...",
+                                 type=["jpg", "jpeg", "png"])
+
+if uploaded_file is not None:
+    image = Image.open(uploaded_file)
+
+    # Only show if user wants to see
+    if st.checkbox('Show Uploaded Image'):
+        st.image(image,
+                 caption='Uploaded Image',
+                 use_column_width=True)
+
+    # Submit button
+    if st.button("Submit"):
+
+        msg1 = st.empty()
+        msg1.write("Pre-processing and extracting text out of your image ....")
+        st_filter = time.perf_counter()
+        # Call the extract_filter_img function
+        filtered_text = extract_filter_img(image, text_extractor)
+        en_filter = time.perf_counter()
+
+        num_items_detected = len(filtered_text)
+        if num_items_detected == 0:
+            st.write("We couldn't detect any menu items ( indian for now ) from your image, please try a different image.")
+
+        elif num_items_detected > 0:
+            st.write(f"Detected {num_items_detected} menu items ( indian ) from your input image ... ")
+
+            msg2 = st.empty()
+            msg2.write("All pre-processing done, transcribing your menu items now ....")
+            st_trans_llm = time.perf_counter()
+            translated_text_dict = transcribe_menu_model(menu_texts=filtered_text,
+                                                         text_tokenizer=item_tokenizer,
+                                                         text_summarizer=item_summarizer
+                                                         )
+
+            msg3 = st.empty()
+            msg3.write("Done transcribing ... ")
+            en_trans_llm = time.perf_counter()
+
+            msg1.empty(); msg2.empty(); msg3.empty()
+            st.success("Image processed successfully!")
+
+            if DEBUG_MODE:
+                filter_time_sec = en_filter - st_filter
+                llm_time_sec = en_trans_llm - st_trans_llm
+                total_time_sec = filter_time_sec + llm_time_sec
+
+                st.write("Time took to extract and filter text {}".format(filter_time_sec))
+                st.write("Time took to summarize by LLM {}".format(llm_time_sec))
+                st.write('Overall time taken in seconds: {}'.format(total_time_sec))
+
+            st.table(translated_text_dict)
inference/config.py
CHANGED
@@ -1,23 +1,34 @@
-def get_device():
-    if torch.cuda.is_available():
-        device = torch.device("cuda")
-        print(f"Using GPU: {torch.cuda.get_device_name(0)}")  # get the name of the GPU being used.
-    else:
-        device = torch.device("cpu")
-        print("Using CPU")
-
-DEVICE =
+INSTRUCTION_PROMPT = """
+The following text contains examples of three items and their corresponding explanations in the required format.\n
+
+Item -> palak paneer.\n
+Explanation -> Major Ingredients here: paneer ( a.k.a cottage cheese ), palak ( spinach ).\n
+How it is made: It's a savory item, made like a gravy; usually made by sauteing spices and mixing saute with boiled paneer and palak.\n
+It goes well with: White basmati rice or Indian flat bread.\n
+Allergens: Paneer may cause digestive discomfort and intolerance to some.\n
+Food Category: Vegetarian, Vegans may not like it, as paneer is usually made from cow milk.
+
+Item -> rumali roti.\n
+Explanation -> Major Ingredients here: roti.\n
+How it is made: A small soft bread, made to size of a napkin ( a.k.a 'rumal' in hindi ); usually made with a combination of whole wheat and all purpose flour.\n
+It goes well with: Most indian gravies such as palak paneer, tomato curry etc.\n
+Allergens: May contain gluten, which is known to cause digestive discomfort and intolerance to some.\n
+Food Category: Vegetarian, Vegan.
+
+Item -> nizami handi.\n
+Explanation -> Major Ingredients here: Different veggies, makhani sauce (skimmed milk, tomato and cashew paste, indian spices), combination of nuts.\n
+How it is made: Makhani sauce is added to onion-tomato based paste and brought to a boil; a medley of veggies and gently flavored whole spices are added and boiled for a short time.\n
+It goes well with: Different kinds of indian flat breads, white basmati and sonamasoori rice.\n
+Allergens: Presence of nuts, butter cream and makhani sauce are known to cause digestive discomfort and intolerance to some.\n
+Food Category: Usually vegetarian, may include chicken or animal meat sometimes, please check with hotel.
+
+Based on Item and explanation pairs provided above, provide similar explanation ('Major Ingredients', 'How is it made', 'It goes well with', 'Allergens' and 'Food Category') to the below item.\n
+Item ->
+"""
+
+DEBUG_MODE = True
+
+DEVICE = 'cpu'
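For context on how this prompt is consumed downstream, here is a minimal sketch (not part of the diff) mirroring transcribe_menu_model in inference/translate.py; "palak paneer" is just an illustrative item:

    from inference.config import INSTRUCTION_PROMPT

    # INSTRUCTION_PROMPT ends with "Item -> ", so appending a menu item name
    # completes the few-shot pattern the model is asked to continue.
    item = "palak paneer"
    prompt_item = INSTRUCTION_PROMPT + " " + item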
inference/preprocess_image.py
CHANGED
@@ -1,6 +1,6 @@
 
 import numpy as np
-from typing import List, Tuple, Optional, AnyStr
+from typing import List, Tuple, Optional, AnyStr
 import nltk
 nltk.download("stopwords")
 nltk.download('punkt')
@@ -11,18 +11,6 @@ import re
 
 
 def preprocess_text(sentence: AnyStr) -> AnyStr:
-
-    """
-    Function that pre-processes input text by removing special characters, hyper links,
-    numbers and by removing stop words
-
-    Parameters:
-        sentence: str, required -> A raw string which may have stop words, special chars etc.
-
-    Returns:
-        return_txt: str -> A clean string with all aforementioned, removed.
-    """
-
     sentence = sentence.lower().replace('{html}', "")
     cleanr = re.compile('<.*?>')
     cleantext = re.sub(cleanr, '', sentence)
@@ -39,78 +27,15 @@ def preprocess_text(sentence: AnyStr) -> AnyStr:
     return return_txt
 
 def image_to_np_arr(image) -> np.array:
-
-    """
-    Function that converts a byte array image into a floating pointer numpy array.
-
-    Parameters:
-        inp_texts: List[str], required -> List of strings, containing item names of a menu in english.
-
-    Returns:
-        np.ndarray
-    """
-
     return np.array(image)
 
-    Function that processes extracted text by removing numbers and special characters,
-    and filters out text with less than 2 words.
-
-    Parameters:
-        raw_extrc_text: List[Tuple], required -> A list of tuples containing extracted text.
-
-    Returns:
-        List[AnyStr] -> A list of processed text strings.
-    """
+def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
+
     output_texts = []
     for _, extr_text, _ in raw_extrc_text:
         # remove all numbers, special characters from a string
         prcsd_txt = preprocess_text(extr_text)
-        if len(prcsd_txt.split(" ")) >= 2:
-            output_texts.append(prcsd_txt)
-
-    return output_texts
-
-headers = ["Item Name", "Major Ingredients", "Making Process", "Portion and Spice Level", "Pairs With", "Allergens", "Food Type"]
-
-# Function to clean the strings
-def clean_string(input_string):
-    parts = input_string.split(',')
-    cleaned_parts = [part.strip() for part in parts if part.strip()]
-    return ', '.join(cleaned_parts)
-
-for i in range(len(gen_output)):
-    # Find all matches
-    matches = re.findall(header_pattern, gen_output[i])
-
-    # Since re.findall returns a list of tuples, we need to extract the first tuple
-    if matches:
-        result = dict(zip(headers, matches[0]))
-        result['Major Ingredients'] = clean_string(result['Major Ingredients'])
-
-        # if any of dictionary values strings are emtpy, replace it with string "Sorry, can't explain this."
-        for k in result.keys():
-            if len(result[k]) < 3 or any(header in result[k] for header in headers):
-                result[k] = "Sorry, can't explain this."
-
-        gen_output[i] = result
-
-    else:
-        if headers[1] in gen_output[i]:
-
-            gen_output[i] = {"May contain misleading explanation":
-                                 dots_pattern.sub('',
-                                                  gen_output[i].split(headers[1])[1].strip().replace('</s>', ''))
-                             }
-        else:
-            gen_output[i] = {"Sorry, can't explain this item": "NA"}
-
-    gen_output[i].pop('Item Name', None)
-
-    return gen_output
+
+        if len(prcsd_txt.split(" ") ) > 2: output_texts.append(prcsd_txt)
+
+    return output_texts
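A note on the input shape process_extracted_text expects: easyocr.Reader.readtext returns (bounding_box, text, confidence) triples, which is why the loop unpacks for _, extr_text, _. A small sketch with invented values:

    # Illustrative only; boxes and confidence scores are made up.
    raw_extrc_text = [
        ([[10, 10], [180, 10], [180, 40], [10, 40]], "Palak Paneer 12.99", 0.91),
        ([[10, 50], [80, 50], [80, 80], [10, 80]], "Rice", 0.88),
    ]
    # preprocess_text strips numbers, special characters and stop words;
    # only strings with more than two remaining words pass the new `> 2`
    # filter (the old code kept two-word items with `>= 2`).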
inference/translate.py
CHANGED
@@ -2,54 +2,21 @@ import streamlit as st
 
 from inference.preprocess_image import (
     image_to_np_arr,
-    process_extracted_text,
-    post_process_gen_outputs
+    process_extracted_text
 )
 
-from inference.config import (
-    model_inf_inp_prompt,
-    header_pattern,
-    dots_pattern,
-    DEVICE,
-    model_name
-)
+from inference.config import INSTRUCTION_PROMPT, DEVICE
 from typing import List, Tuple, Optional, AnyStr, Dict
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import T5Tokenizer, T5ForConditionalGeneration
 import easyocr
 import time
 
 use_gpu = True
-if DEVICE
+if DEVICE == 'cpu': use_gpu = False
 
-@st.cache_resource
-def load_models(item_summarizer: AnyStr) -> Tuple:
-
-    """
-    Function to load the models required for the inference process. Cached to avoid loading the models, every time the function is called.
-
-    Parameters:
-        item_summarizer: str, required -> The LLM model name to be used for item summarization.
-
-    Returns:
-        Tuple -> Tuple containing the required models for the inference process.
-    """
-
-    # model to extract text from image
-    text_extractor = easyocr.Reader(['en'],
-                                    gpu = use_gpu
-                                    )
-
-    # tokenizer and model to generate item summary
-    tokenizer = AutoTokenizer.from_pretrained(item_summarizer)
-    model = AutoModelForCausalLM.from_pretrained(item_summarizer)
-
-    return (text_extractor, tokenizer, model)
-
-text_extractor, item_tokenizer, item_summarizer = load_models(item_summarizer = model_name)
 
 
 # Define your extract_filter_img function
-async def extract_filter_img(image) -> Dict:
+def extract_filter_img(image, text_extractor) -> Dict:
 
     """
     1. Convert Image to numpy array
@@ -81,8 +48,7 @@ async def extract_filter_img(image) -> Dict:
         if i in ind_add_delays:
             time.sleep(0.5)
 
-
-        else: result = func(result)
+        result = func(result)
 
     status_message.write(end_message)
@@ -97,26 +63,42 @@ async def extract_filter_img(image) -> Dict:
     return result
 
 
-def transcribe_menu_model(
 
+def transcribe_menu_model(menu_texts: List[AnyStr],
+                          text_summarizer = None,
+                          text_tokenizer = None) -> Dict:
+
+    summarized_menu_items = {}
+
+    for mi in menu_texts:
+        if not text_summarizer:
+            raise NotImplementedError(""" """)
+
+        else:
+            prompt_item = INSTRUCTION_PROMPT + " " + mi + """
+
+
+            """
+            input_ids = text_tokenizer(prompt_item, return_tensors="pt").input_ids
+
+            outputs = text_summarizer.generate(input_ids,
+                                               max_new_tokens = 512
+                                               )
+
+            summarized_menu_items[mi] = text_tokenizer.decode(
+                outputs[0],
+                skip_special_tokens = True
+            )
+
+    return summarized_menu_items
+
+def load_models(item_summarizer: AnyStr) -> Tuple:
+    text_extractor = easyocr.Reader(['en'],
+                                    gpu = use_gpu
+                                    )
+    tokenizer = T5Tokenizer.from_pretrained(item_summarizer)
+    model = T5ForConditionalGeneration.from_pretrained(item_summarizer)
+
+    return (text_extractor, tokenizer, model)
 
 def classify_menu_text(extrc_str: List[AnyStr]) -> List[AnyStr]:
     return extrc_str
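Taken together, the refactor exposes a plain function API in place of the deleted multi-page flow. A minimal end-to-end sketch mirroring the new app.py (a running Streamlit session is assumed, since extract_filter_img writes status messages via st; "menu.jpg" is a hypothetical input file):

    from PIL import Image
    from inference.translate import load_models, extract_filter_img, transcribe_menu_model

    text_extractor, item_tokenizer, item_summarizer = load_models(
        item_summarizer="google/flan-t5-large")

    image = Image.open("menu.jpg")  # hypothetical local image
    items = extract_filter_img(image, text_extractor)
    explanations = transcribe_menu_model(menu_texts=items,
                                         text_tokenizer=item_tokenizer,
                                         text_summarizer=item_summarizer)
    # explanations maps each detected item name to its generated text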
pages.py
DELETED
@@ -1,214 +0,0 @@
-import streamlit as st
-from streamlit import session_state as sst
-
-
-from utils import navigate_to
-from inference.config import DEBUG_MODE
-
-from inference.translate import extract_filter_img, transcribe_menu_model, classify_menu_text
-from inference.preprocess_image import preprocess_text
-
-import os
-import time
-import pandas as pd
-from PIL import Image
-from typing import List
-import json
-from concurrent.futures import ThreadPoolExecutor, as_completed
-
-# Setting workers to be 70% of all available virtual cpus in system
-cpu_count = os.cpu_count()
-pool = ThreadPoolExecutor(max_workers=int(cpu_count*0.7))
-
-# Function that handles logic of explaining menu items from manual input
-async def manual_input_page():
-
-    """
-    Function that takes text input from user in input box of streamlit, user can add multiple text boxes and submit finally.
-
-    Parameters:
-        None
-
-    Returns:
-        List[str]: List of strings, containing item names of a menu in english.
-    """
-
-    st.write("This is the Manual Input Page.")
-    st.write("Once done, click on 'Explain My Menu' button to get explanations for each item ... ")
-
-    inp_texts = []
-    num_text_boxes = st.number_input("Number of text boxes", min_value=1, step=1)
-    for i in range(num_text_boxes):
-        text_box = st.text_input(f"Food item {i+1}")
-        if text_box:
-            inp_texts.append(text_box)
-
-    if len(inp_texts) > 0:
-
-        # Show user submit button only if they have entered some text and set text in session state
-        sst["user_entered_items"] = inp_texts
-        st.button("Explain My Menu", on_click=navigate_to, args=("Inference",))
-
-    else:
-        st.write("Please enter some items to proceed ...")
-
-    st.button("Go back Home", on_click=navigate_to, args=("Home",))
-
-
-# Function that handles logic of explaining menu items from image uploads
-async def image_input_page():
-    """
-    Function that contains content of main page i.e., image uploader and submit button to navigate to next page.
-    Upon submit, control goes to model inference 'page'.
-
-    Parameters:
-        None
-
-    Returns:
-        None
-    """
-
-    st.write("This is the Image Input Page.")
-
-    # Streamlit function to upload an image from any device
-    uploaded_file = st.file_uploader("Choose an image...",
-                                     type=["jpg", "jpeg", "png"])
-
-    # Remove preivous states' value of input image if it exists
-    sst.pop('input_image', None)
-
-    # Submit button
-    if uploaded_file is not None:
-        image = Image.open(uploaded_file)
-
-        # Only show if user wants to see
-        if st.checkbox('Show Uploaded Image'):
-            st.image(image,
-                     caption='Uploaded Image',
-                     use_column_width=True)
-
-        sst["input_image"] = image
-
-        # Show user submit button only if they have uploaded an image
-        st.button("Translate My Menu",
-                  on_click=navigate_to,
-                  args=("Inference",))
-
-    # Warning message to user
-    st.info("""This application is for education purposes only. It uses AI, hence it's dietary
-            recommendations are not to be taken as medical advice, author doesn't bear responsibility
-            for incorrect dietary recommendations. Please proceed with caution.
-            """)
-
-    # if user wants to go back, make sure to reset the session state
-    st.button("Go back Home", on_click=navigate_to, args=("Home",))
-
-
-# Function that handles model inference
-async def model_inference_page():
-
-    """
-    Function that pre-processes input text from state variables, does concurrent inference
-    and toggles state between pages if needed.
-
-    Parameters:
-        None
-    Returns:
-        None
-    """
-
-    second_title = st.empty()
-    second_title.title(" Using ML to explain your menu items ... ")
-
-    # User can either upload an image or enter text manually, we check for both
-    if "input_image" in sst:
-        image = sst["input_image"]
-
-        msg1 = st.empty()
-        msg1.write("Pre-processing and extracting text out of your image ....")
-        # Call the extract_filter_img function
-        filtered_text = await extract_filter_img(image)
-        num_items_detected = len(filtered_text)
-
-    if "user_entered_items" in sst:
-        user_text = sst["user_entered_items"]
-        st.write("Pre-processing and filtering text from user input ....")
-
-        filtered_text = [preprocess_text(ut) for ut in user_text]
-
-        num_items_detected = len(filtered_text)
-
-    # irrespective of source of user entry, we check if we have any items to process
-    if num_items_detected == 0:
-        st.write("We couldn't detect any menu items ( indian for now ) from your image, please try a different image by going back.")
-
-    elif num_items_detected > 0:
-        st.write(f"Detected {num_items_detected} menu items from your input image ... ")
-
-        msg2 = st.empty()
-        msg2.write("All pre-processing done, transcribing your menu items now ....")
-        st_trans_llm = time.perf_counter()
-
-        await dist_llm_inference(filtered_text)
-
-        msg3 = st.empty()
-        msg3.write("Done transcribing ... ")
-        en_trans_llm = time.perf_counter()
-
-        msg2.empty(); msg3.empty()
-        st.success("Image processed successfully!")
-
-        # Some basic stats for debug mode
-        if DEBUG_MODE:
-            llm_time_sec = en_trans_llm - st_trans_llm
-            st.write("Time took to summarize by LLM {}".format(llm_time_sec))
-
-    # If user clicked in "translate_another" button reset all session state variables and go back to home
-    st.button("Go back Home", on_click=navigate_to, args=("Home",))
-
-
-# Function that performs LLM inference on a single item
-async def dist_llm_inference(inp_texts: List[str]) -> None:
-
-    """
-    Function that performs concurrent LLM inference using threadpool. It displays
-    results of those threads that are done with execution, as a dynamic row to streamlit table, rather than
-    waiting for all threads to be done.
-
-    Parameters:
-        inp_texts: List[str], required -> List of strings, containing item names of a menu in english.
-
-    Returns:
-        None
-    """
-
-    df = pd.DataFrame([('ITEM NAME', 'EXPLANATION')])
-
-    sl_table = st.table(df)
-    tp_futures = {pool.submit(transcribe_menu_model, mi): mi for mi in inp_texts}
-
-    for tpftr in as_completed(tp_futures):
-
-        item = tp_futures[tpftr]
-
-        try:
-            exp = tpftr.result()
-
-            sl_table.add_rows([(item, str(exp))])
-
-        except Exception as e:
-            print("Could not add a new row dynamically, because of this error:", e)
-
-    return
utils.py
DELETED
@@ -1,15 +0,0 @@
-
-from streamlit import session_state as sst
-def navigate_to(page: str) -> None:
-    """
-    Function to set the current page in the state of streamlit. A helper for
-    simulating navigation in streamlit.
-
-    Parameters:
-        page: str, required.
-
-    Returns:
-        None
-    """
-
-    sst["page"] = page