Spaces:

Ransaka
/

OCR-CRNN

Runtime error

App Files Files Community

Ransaka committed on Oct 26, 2024

Commit

232505b

verified ·

1 Parent(s): 93ea391

Update app.py

Browse files

Files changed (1) hide show

app.py +90 -68

app.py CHANGED Viewed

@@ -3,10 +3,30 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torchvision import transforms
-from torchvision.transforms import functional as TF
 from PIL import Image
-from sinlib import Tokenizer
 from pathlib import Path
 MAX_LENGTH = 32
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -14,75 +34,84 @@ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 # Load tokenizer
 @st.cache_resource
 def load_tokenizer():
-    tokenizer = Tokenizer(max_length=1000).load_from_pretrained("gpt2.json")
-    tokenizer.max_length = MAX_LENGTH
     return tokenizer
 tokenizer = load_tokenizer()
 class CRNN(nn.Module):
-    def __init__(self, num_chars):
         super(CRNN, self).__init__()
-        self.cnn = nn.Sequential(
-            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(kernel_size=2, stride=2),
-            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(kernel_size=2, stride=2),
-            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
-            nn.BatchNorm2d(256),
             nn.ReLU(),
-            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(kernel_size=(2, 1)),
-            nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1),
-            nn.BatchNorm2d(512),
-            nn.ReLU(),
-            nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1),
-            nn.ReLU(),
-            nn.MaxPool2d(kernel_size=(2, 1)),
-            nn.Conv2d(512, 512, kernel_size=2, stride=1),
-            nn.BatchNorm2d(512),
-            nn.ReLU()
         )
-        # RNN layers
-        self.rnn = nn.GRU(512 * 7, 256, bidirectional=True, batch_first=True, num_layers=2)
-        self.linear = nn.Linear(512, num_chars)
     def forward(self, x):
-        conv = self.cnn(x)
-        batch, channel, height, width = conv.size()
-        conv = conv.permute(0, 3, 1, 2)
-        conv = conv.contiguous().view(batch, width, channel * height)
-        output, _ = self.rnn(conv)
-        output = self.linear(output)
         return output
 @st.cache_resource
 def load_model(selected_model_path):
-    model = CRNN(num_chars=len(tokenizer))
-    model.load_state_dict(torch.load(f'{selected_model_path}', map_location=torch.device('cpu')))
     model.eval()
     return model
-def preprocess_image(image):
-    transform = transforms.Compose([
-        transforms.Grayscale(),
-        transforms.ToTensor(),
-    ])
-    image = TF.resize(image, (128, 2600), interpolation=Image.BILINEAR)
     image = transform(image)
-    if image.shape[0] != 1:
-        image = image.mean(dim=0, keepdim=True)
-    image = image.unsqueeze(0)
-    return image
 def inference(model, image):
     with torch.no_grad():
         image = image.to(DEVICE)
@@ -91,35 +120,28 @@ def inference(model, image):
         pred_chars = torch.argmax(log_probs, dim=2)
     return pred_chars.squeeze().cpu().numpy()
-st.title("CRNN Printed Text Recognition")
-st.warning("**Note**: This model was trained on images with these settings, \
-           with width ranging from 800 to 2600 pixels and height ranging from 128 to 600 pixels. \
-           For better results, use images within these limitations."
-           )
-fp = Path(".").glob("*.pth")
 selected_model_path = st.selectbox(label="Select Model...", options=fp)
 model = load_model(selected_model_path)
 uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
 if uploaded_file is not None:
-    image = Image.open(uploaded_file)
     st.image(image, caption='Uploaded Image', use_column_width=True)
-    w,h = image.size
-    w_color = h_color  = 'green'
-    if  not 800 <= w <= 2600:
-        w_color = "red"
-    if  not 128 <= h <= 600:
-        h_color = "red"
-    with st.expander("Click See Image Details"):
-        st.write(f"Width = :{w_color}[{w}];",f"Height = :{h_color}[{h}]")
     if st.button('Predict'):
-        processed_image = preprocess_image(image)
-        predicted_sequence = inference(model, processed_image)
-        decoded_text = tokenizer.decode(predicted_sequence, skip_special_tokens=True)
         st.write("Predicted Text:")
-        st.write(decoded_text)
 st.markdown("---")
 st.write("Note: This app uses a pre-trained CRNN model for printed Sinhala text recognition.")

 import torch.nn as nn
 import torch.nn.functional as F
 from torchvision import transforms
 from PIL import Image
 from pathlib import Path
+import pickle
# Shared image-to-tensor pipeline; preprocess_image applies it after resizing.
transform = transforms.Compose([transforms.ToTensor()])
class TextProcessor:
    """Character-level codec between label strings and integer ids.

    Symbols of ``alphabet`` are numbered from 1 in iteration order; id 0 is
    reserved for the ``"[PAD]"`` token.
    """

    def __init__(self, alphabet):
        """Build the symbol-to-id (``stoi``) and id-to-symbol (``itos``) tables."""
        self.alphabet = alphabet
        self.pad_token = "[PAD]"

        symbol_to_id = {}
        for index, symbol in enumerate(self.alphabet, start=1):
            symbol_to_id[symbol] = index
        symbol_to_id[self.pad_token] = 0

        self.stoi = symbol_to_id
        self.itos = dict((index, symbol) for symbol, index in symbol_to_id.items())

    def encode(self, label):
        """Return the list of ids for each symbol in ``label`` (KeyError if unknown)."""
        return list(map(self.stoi.__getitem__, label))

    def decode(self, ids):
        """Return the string formed by concatenating the symbols for ``ids``."""
        return ''.join(map(self.itos.__getitem__, ids))

    def __len__(self):
        """Return the vocabulary size: the alphabet plus the pad token."""
        return 1 + len(self.alphabet)
 MAX_LENGTH = 32
 DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 # Load tokenizer
 @st.cache_resource
 def load_tokenizer():
+    with open("text_process.cls",'rb') as f:
+        tokenizer = pickle.load(f)
     return tokenizer
 tokenizer = load_tokenizer()
+encode = tokenizer.encode
+decode = tokenizer.decode
 class CRNN(nn.Module):
+    def __init__(self, num_channels, hidden_size, num_classes):
         super(CRNN, self).__init__()
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(1, 64, kernel_size=(2,3), padding=1),
             nn.ReLU(),
+            nn.MaxPool2d(2, 2)
         )
+        self.conv2 = nn.Sequential(
+            nn.Conv2d(64, 128, kernel_size=(2,3), padding=1),
+            nn.ReLU(),
+            nn.MaxPool2d(2, 2)
+        )
+        self.rnn = nn.LSTM(128 * 16, hidden_size, bidirectional=True, batch_first=True)
+        self.fc = nn.Linear(hidden_size * 2, num_classes)
     def forward(self, x):
+        # x shape: [batch_size, channels, height, width]
+        # CNN feature extraction
+        conv = self.conv1(x)
+        conv = self.conv2(conv)
+        batch, channels, height, width = conv.size()
+        conv = conv.permute(0, 3, 1, 2)  # [batch, width, channels, height]
+        conv = conv.contiguous().view(batch, width, channels * height)
+        rnn, _ = self.rnn(conv)
+        output = self.fc(rnn)
         return output
 @st.cache_resource
 def load_model(selected_model_path):
+    model = CRNN(num_channels=1, hidden_size=256, num_classes=len(tokenizer))
+    model.load_state_dict(torch.load(selected_model_path, map_location=torch.device('cpu')))
     model.eval()
     return model
def preprocess_image(img, target_height=61, min_width=4):
    """Resize ``img`` to a fixed height, keeping aspect ratio, and convert it to a tensor.

    Args:
        img: A PIL image. The caller is expected to pass a single-channel
            image (the app converts uploads with ``.convert("L")``).
        target_height: Height in pixels after resizing. The default of 61 is
            the height the CRNN's fixed LSTM input size corresponds to.
        min_width: Lower bound for the resized width. The CRNN halves the
            width twice with 2x2 max-pooling, so fewer than 4 columns would
            leave no timesteps for the recurrent layer.

    Returns:
        The result of applying the module-level ``transform`` to the resized image.

    Raises:
        ValueError: If the image has a zero width or height.
    """
    original_width, original_height = img.size
    if original_width <= 0 or original_height <= 0:
        raise ValueError(f"Cannot preprocess an empty image of size {img.size}")
    # Scale the width by the same factor as the height to preserve aspect ratio;
    # int() truncation can reach 0 for very tall, narrow images, hence the clamp.
    scaled_width = int(target_height * original_width / original_height)
    new_width = max(min_width, scaled_width)
    image = img.resize((new_width, target_height))
    return transform(image)
def post_process(preds):
    """Collapse per-timestep predictions into text (greedy CTC-style decoding).

    Consecutive repeats of the same id are merged and id 0 is dropped. A
    repeat is only merged when it directly follows the same id: ``[5, 0, 5]``
    yields two symbols, whereas the previous implementation compared against
    the last *emitted* id and wrongly merged them into one.

    NOTE(review): this assumes the model was trained with a CTC-style
    objective using index 0 as the blank (index 0 is ``"[PAD]"`` in
    ``TextProcessor``); confirm against the training code.

    Args:
        preds: Iterable of integer class ids, one per timestep.

    Returns:
        The string produced by the module-level ``decode`` for the kept ids.
    """
    blank_id = 0
    collapsed = []
    previous = blank_id
    for pred in preds:
        # Emit only on a change of symbol; blanks reset `previous`, so a
        # genuine double letter separated by a blank survives.
        if pred != blank_id and pred != previous:
            collapsed.append(pred)
        previous = pred
    return decode(collapsed)
 def inference(model, image):
     with torch.no_grad():
         image = image.to(DEVICE)
         pred_chars = torch.argmax(log_probs, dim=2)
     return pred_chars.squeeze().cpu().numpy()
def predict(image):
    """Run the full recognition pipeline on one PIL image and return the text.

    Uses the module-level ``model``: the image is preprocessed, given a batch
    dimension, passed through the model, reduced to the highest-scoring class
    per timestep, and decoded by ``post_process``.
    """
    batch = preprocess_image(image).unsqueeze(0)  # add the batch dimension the model expects
    # Inference only: skip building the autograd graph instead of detaching afterwards.
    with torch.no_grad():
        predictions = model(batch)
    pred_ids = torch.argmax(predictions, dim=-1).flatten().tolist()
    return post_process(pred_ids)
# --- Streamlit page -------------------------------------------------------
st.title("CRNN Sinhala Printed Text Recognition")

# Materialise the glob: Path.glob returns a one-shot generator, and sorting
# makes the order (and therefore the default selection) deterministic.
fp = sorted(Path(".").glob("crnn*.pt"))

if not fp:
    # Without this guard the selectbox has nothing to offer and the app
    # would go on to call load_model with no usable path.
    st.error("No model checkpoints matching 'crnn*.pt' were found.")
else:
    selected_model_path = st.selectbox(label="Select Model...", options=fp)
    model = load_model(selected_model_path)

    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

    if uploaded_file is not None:
        # The network takes a single-channel input, so convert to grayscale up front.
        image = Image.open(uploaded_file).convert("L")
        st.image(image, caption='Uploaded Image', use_column_width=True)

        if st.button('Predict'):
            predicted_text = predict(image)
            st.write("Predicted Text:")
            st.write(predicted_text)

st.markdown("---")
st.write("Note: This app uses a pre-trained CRNN model for printed Sinhala text recognition.")