Amith Adiraju
committed on
Commit
·
9a0f501
1
Parent(s):
ca2b51e
Copied the entire codebase from a working Streamlit application in my personal git repository. This application lets users upload images of menus (only Indian menus are supported for now); using OpenCV and LLMs, it explains each item on the menu in a specific format.
Browse files- requirements.txt +12 -0
- src/__init__.py +0 -0
- src/inference/__init__.py +0 -0
- src/inference/config.py +34 -0
- src/inference/preprocess_image.py +41 -0
- src/inference/translate.py +105 -0
- src/main.py +71 -0
- src/requirements.txt +11 -0
requirements.txt
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit==1.37.1
|
2 |
+
pandas==2.2.2
|
3 |
+
altair
|
4 |
+
easyocr==1.6.2
|
5 |
+
matplotlib==3.7.1
|
6 |
+
numpy==1.24.2
|
7 |
+
lorem==0.1.1
|
8 |
+
Pillow==9.5.0
|
9 |
+
nltk==3.9.1
|
10 |
+
torch==2.1.0
|
11 |
+
transformers==4.44.2
|
12 |
+
sentencepiece
|
src/__init__.py
ADDED
File without changes
|
src/inference/__init__.py
ADDED
File without changes
|
src/inference/config.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Few-shot prompt prepended to every menu item before it is sent to the
# summarizer. The section names listed in the final instruction must match the
# headings used in the three worked examples ("How it is made", ...), and the
# prompt must end with "Item ->" because the caller appends the item name.
INSTRUCTION_PROMPT = """
The following text contains examples of three items and their corresponding explanations in the required format.\n

Item -> palak paneer.\n
Explanation -> Major Ingredients here: paneer ( a.k.a cottage cheese ) , palak ( spinach ).\n
How it is made: It's a savory item, made like a gravy; usually made by sauteing spices and mixing saute with boiled paneer and palak.\n
It goes well with: White basmati rice or Indian flat bread.\n
Allergens: Paneer may cause digestive discomfort and intolerance to some.\n
Food Category: Vegetarian, Vegans may not like it, as paneer is usually made from cow milk.


Item -> rumali roti.\n
Explanation -> Major Ingredients here: roti.\n
How it is made: A small soft bread, made to size of a napkin ( a.k.a 'rumal' in hindi ); usually made with a combination of whole wheat and all purpose flour.\n
It goes well with: Most indian gravies such as palak paneer, tomato curry etc.\n
Allergens: May contain gluten, which is known to cause digestive discomfort and intolerance to some.\n
Food Category: Vegetarian, Vegan.


Item -> nizami handi.\n
Explanation -> Major Ingredients here: Different veggies, makhani sauce (skimmed milk, tomato and cashew paste , indian spices), combination of nuts.\n
How it is made: Makhani sauce is added to onion-tomato based paste and brought to a boil; a Medley of veggies and gently flavored whole spices are added and boiled for small time.\n
It goes well with: Different kinds of indian flat breads, white basmati and sonamasoori rice.\n
Allergens: Presence of nuts, butter cream and makhani sauce are known to cause digestive discomfort and intolerance to some.\n
Food Category: Usually vegetarian, may include chicken or animal meat sometimes, please check with hotel.


Based on Item and explanation pairs provided above, provide similar explanation ('Major Ingredients', 'How it is made', 'It goes well with', 'Allergens' and 'Food Category') to the below item.\n
Item ->
"""

# When True, the app prints timing information after processing an image.
DEBUG_MODE = True

# Device selector; the value 'cpu' makes the OCR reader run without a GPU.
DEVICE = 'cpu'
|
src/inference/preprocess_image.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import numpy as np
|
3 |
+
from typing import List, Tuple, Optional, AnyStr
|
4 |
+
import nltk
|
5 |
+
nltk.download("stopwords")
|
6 |
+
nltk.download('punkt')
|
7 |
+
|
8 |
+
from nltk.tokenize import RegexpTokenizer
|
9 |
+
from nltk.corpus import stopwords
|
10 |
+
import re
|
11 |
+
|
12 |
+
|
13 |
+
def preprocess_text(sentence: AnyStr) -> AnyStr:
    """Reduce one piece of extracted text to its meaningful lower-case words.

    The text is lower-cased, then the literal ``{html}`` marker, anything
    between ``<`` and ``>``, ``http...`` URLs and digit runs are removed.
    What remains is split into word tokens; tokens of two characters or fewer
    and English stop words are discarded.

    Args:
        sentence: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces (empty string if none).
    """
    lowered = sentence.lower().replace('{html}', "")
    without_tags = re.sub(r'<.*?>', '', lowered)
    without_urls = re.sub(r'http\S+', '', without_tags)
    without_digits = re.sub('[0-9]+', '', without_urls)

    tokens = RegexpTokenizer(r'\w+').tokenize(without_digits)

    # Fetch the stop-word list once and keep it as a set: the original looked
    # it up again for every single token, which re-read the list each time
    # and did a linear scan per membership test.
    english_stopwords = set(stopwords.words('english'))
    kept_words = [
        tok for tok in tokens
        if len(tok) > 2 and tok not in english_stopwords
    ]

    return " ".join(kept_words)
|
28 |
+
|
29 |
+
def image_to_np_arr(image) -> np.ndarray:
    """Convert an image-like object into a NumPy array.

    Args:
        image: Any object ``np.array`` can consume (the app passes the image
            opened with PIL).

    Returns:
        A new ``np.ndarray`` built from ``image``.
    """
    # ``np.array`` is the constructor function; the returned *type* is
    # ``np.ndarray``, which is what the annotation now states.
    return np.array(image)
|
31 |
+
|
32 |
+
def process_extracted_text(raw_extrc_text: List[Tuple]) -> List[AnyStr]:
    """Clean every detected text fragment and keep the multi-word ones.

    Args:
        raw_extrc_text: Three-element tuples whose middle element is the
            detected text; the first and last elements are ignored.

    Returns:
        The cleaned strings that still contain more than two
        space-separated words, in their original order.
    """
    # Strip numbers, special characters and stop words from each fragment.
    cleaned_fragments = (
        preprocess_text(fragment) for _, fragment, _ in raw_extrc_text
    )
    return [
        cleaned for cleaned in cleaned_fragments
        if len(cleaned.split(" ")) > 2
    ]
|
src/inference/translate.py
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
from inference.preprocess_image import (
|
4 |
+
image_to_np_arr,
|
5 |
+
process_extracted_text
|
6 |
+
)
|
7 |
+
|
8 |
+
from inference.config import INSTRUCTION_PROMPT, DEVICE
|
9 |
+
from typing import List, Tuple, Optional, AnyStr, Dict
|
10 |
+
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
11 |
+
import easyocr
|
12 |
+
import time
|
13 |
+
|
14 |
+
# The OCR reader uses the GPU unless the configured device is exactly 'cpu'.
use_gpu = DEVICE != 'cpu'
|
16 |
+
|
17 |
+
|
18 |
+
# Define your extract_filter_img function
|
19 |
+
def extract_filter_img(image, text_extractor) -> List[AnyStr]:
    """Run the text-extraction pipeline on an image while showing progress.

    Each step receives the previous step's output:
        1. Convert the image to a numpy array.
        2. Detect and extract text from the image (list of tuples).
        3. Clean the extracted text and filter out irrelevant fragments.
        4. Keep only menu-related strings.

    Args:
        image: Image object accepted by ``image_to_np_arr``.
        text_extractor: Object exposing a ``readtext`` method.

    Returns:
        The output of the last step, i.e. the list of cleaned strings
        returned by ``classify_menu_text``.
    """
    progress_bar = st.progress(0)
    status_message = st.empty()

    functions_messages = [
        (image_to_np_arr, 'Converting Image to required format', 'Done Converting !'),
        (text_extractor.readtext, 'Extracting text from inp image', 'Done Extracting !'),
        (process_extracted_text, 'Clean Raw Extracted text', 'Done Cleaning !'),
        (classify_menu_text, 'Removing non-menu related text', 'Done removing !'),
    ]

    # Steps that get a short pause before and after they run, so their status
    # text stays visible. (The former list also named index 4, which does not
    # exist in a four-step pipeline.)
    delayed_steps = {0, 2, 3}

    result = image
    total_steps = len(functions_messages)

    try:
        for i, (func, start_message, end_message) in enumerate(functions_messages):
            status_message.write(start_message)
            if i in delayed_steps:
                time.sleep(0.5)

            result = func(result)

            status_message.write(end_message)
            progress_bar.progress((i + 1) / total_steps)
            if i in delayed_steps:
                time.sleep(0.5)
    finally:
        # Always remove the temporary widgets, even when a step raises, so a
        # failed run does not leave a stale progress bar on the page.
        progress_bar.empty()
        status_message.empty()

    return result
|
64 |
+
|
65 |
+
|
66 |
+
def transcribe_menu_model(menu_texts: List[AnyStr],
                          text_summarizer = None,
                          text_tokenizer = None) -> Dict:
    """Generate an explanation for every menu item with the given model.

    For each item the few-shot ``INSTRUCTION_PROMPT`` is extended with the
    item text, tokenized, passed to ``text_summarizer.generate`` and the first
    generated sequence is decoded back to text.

    Args:
        menu_texts: Menu item strings to explain.
        text_summarizer: Model object exposing ``generate``.
        text_tokenizer: Tokenizer that is callable on a prompt and exposes
            ``decode``.

    Returns:
        Dict mapping each menu item to its generated explanation. Items that
        occur more than once share a single entry.

    Raises:
        NotImplementedError: If there are items to explain but no
            ``text_summarizer`` was supplied.
    """
    summarized_menu_items = {}

    # Nothing to explain: keep returning an empty mapping, as before.
    if not menu_texts:
        return summarized_menu_items

    # The check does not depend on the item, so do it once instead of on
    # every loop iteration, and say what is actually missing.
    if not text_summarizer:
        raise NotImplementedError(
            "No text_summarizer was provided and no fallback way of "
            "explaining menu items is implemented."
        )

    for mi in menu_texts:
        # Blank lines after the item separate it from the generated answer.
        prompt_item = INSTRUCTION_PROMPT + " " + mi + "\n\n\n"
        input_ids = text_tokenizer(prompt_item, return_tensors="pt").input_ids

        outputs = text_summarizer.generate(input_ids, max_new_tokens=512)

        summarized_menu_items[mi] = text_tokenizer.decode(
            outputs[0],
            skip_special_tokens=True,
        )

    return summarized_menu_items
|
93 |
+
|
94 |
+
def load_models(item_summarizer: AnyStr) -> Tuple:
    """Build the OCR reader and the T5 tokenizer/model used by the app.

    Args:
        item_summarizer: Name or path handed to ``from_pretrained`` for both
            the tokenizer and the model.

    Returns:
        ``(text_extractor, tokenizer, model)`` in that order.
    """
    # English-only OCR reader; GPU usage follows the module-level flag.
    ocr_reader = easyocr.Reader(['en'], gpu=use_gpu)

    t5_tokenizer = T5Tokenizer.from_pretrained(item_summarizer)
    t5_model = T5ForConditionalGeneration.from_pretrained(item_summarizer)

    return ocr_reader, t5_tokenizer, t5_model
|
102 |
+
|
103 |
+
def classify_menu_text(extrc_str: List[AnyStr]) -> List[AnyStr]:
    """Placeholder for menu-text classification.

    No filtering is implemented: the list passed in is handed back unchanged
    (the very same object, not a copy).
    """
    return extrc_str
|
105 |
+
|
src/main.py
ADDED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
from inference.translate import (
|
4 |
+
extract_filter_img,
|
5 |
+
transcribe_menu_model,
|
6 |
+
load_models
|
7 |
+
)
|
8 |
+
|
9 |
+
from inference.config import DEBUG_MODE
|
10 |
+
from PIL import Image
|
11 |
+
import time
|
12 |
+
|
13 |
+
# Streamlit app
|
14 |
+
st.title("Image Upload and Processing")


@st.cache_resource
def _load_cached_models(model_name: str):
    """Load the OCR reader, tokenizer and summarizer only once.

    Streamlit re-executes this script on every widget interaction (checkbox,
    upload, button). Without caching, the models would be rebuilt on each of
    those reruns.
    """
    return load_models(item_summarizer=model_name)


# Using open source text detector, LLM for explaining items
text_extractor, item_tokenizer, item_summarizer = _load_cached_models("google/flan-t5-large")

# Streamlit function to upload an image from any device
uploaded_file = st.file_uploader("Choose an image...",
                                 type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file)

    # Only show if user wants to see
    if st.checkbox('Show Uploaded Image'):
        st.image(image,
                 caption='Uploaded Image',
                 use_column_width=True)

    # Submit button
    if st.button("Submit"):

        msg1 = st.empty()
        msg1.write("Pre-processing and extracting text out of your image ....")
        st_filter = time.perf_counter()
        # Call the extract_filter_img function
        filtered_text = extract_filter_img(image, text_extractor)
        en_filter = time.perf_counter()

        msg2 = st.empty()
        msg2.write("All pre-processing done, transcribing your menu items now ....")
        st_trans_llm = time.perf_counter()
        translated_text_dict = transcribe_menu_model(menu_texts=filtered_text,
                                                     text_tokenizer=item_tokenizer,
                                                     text_summarizer=item_summarizer
                                                     )
        # Stop the timer right after the model call so the reported LLM time
        # does not include the status-message update below.
        en_trans_llm = time.perf_counter()

        msg3 = st.empty()
        msg3.write("Done transcribing ... ")

        msg1.empty(); msg2.empty(); msg3.empty()
        st.success("Image processed successfully! " )

        if DEBUG_MODE:
            filter_time_sec = en_filter - st_filter
            llm_time_sec = en_trans_llm - st_trans_llm
            total_time_sec = filter_time_sec + llm_time_sec

            st.write("Time took to extract and filter text {}".format(filter_time_sec))
            st.write("Time took to summarize by LLM {}".format(llm_time_sec))
            st.write('Overall time taken in seconds: {}'.format(total_time_sec))

        st.table(translated_text_dict)
|
71 |
+
|
src/requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sentencepiece==0.2.0
|
2 |
+
transformers==4.44.2
|
3 |
+
streamlit==1.37.1
|
4 |
+
pandas==2.2.2
|
5 |
+
altair
|
6 |
+
easyocr==1.6.2
|
7 |
+
matplotlib==3.7.1
|
8 |
+
numpy==1.24.2
|
9 |
+
Pillow==9.5.0
|
10 |
+
nltk==3.9.1
|
11 |
+
torch==2.1.0
|