File size: 8,408 Bytes
deaec20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3404a18
deaec20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d3baca2
deaec20
 
 
d3baca2
deaec20
 
 
d3baca2
deaec20
 
3404a18
d3baca2
 
 
 
 
 
 
 
 
 
 
 
 
deaec20
 
 
 
3404a18
deaec20
 
 
 
 
 
 
 
 
 
 
 
 
3404a18
deaec20
 
 
 
3404a18
deaec20
 
 
 
 
 
 
 
3404a18
deaec20
d3baca2
 
deaec20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3404a18
deaec20
d3baca2
 
deaec20
 
 
 
 
 
 
 
 
 
d3baca2
deaec20
 
 
 
 
 
 
 
 
 
 
 
 
3404a18
deaec20
 
3404a18
deaec20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf4d19f
deaec20
 
 
 
 
d3baca2
deaec20
 
 
 
 
 
d3baca2
deaec20
 
 
 
d3baca2
 
deaec20
 
 
 
 
 
ee6e98a
bf4d19f
deaec20
 
efa9e01
d3baca2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# -*- coding: utf-8 -*-
"""disaster_help_ner_production.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_wqnv01NeDdOLp2X1KT6WfvNgVwwTu5a

Note: This is a continuation of [this](https://colab.research.google.com/drive/1HlZLIVcAtWVeitZjWV3LclrH3gBwuymp?usp=sharing) notebook:

# Imports
"""


import json
import requests
import configparser

import spacy
import spacy_transformers
from spacy import displacy
from spacy.tokens import Span
from transformers import pipeline
from spacy.matcher import PhraseMatcher

import csv
import numpy as np
import pandas as pd

import geopy
import gradio as gr
from geopy.geocoders import Nominatim

"""# Telegram """

# bot_token = 'get from user using gradio'

offset = None  # update_id to request next; None until the first successful poll


def get_data(bot_token):
    """Fetch the text of new channel posts from the Telegram Bot API.

    Calls ``getUpdates`` for the bot identified by ``bot_token``. After a
    successful poll the module-level ``offset`` is moved past the newest
    update so the next call does not receive the same updates again.

    Args:
        bot_token: Telegram Bot API token; it is inserted into the request URL.

    Returns:
        A list with the ``text`` of every channel post in the batch. On any
        failure (request error, rejected token, empty batch, or a batch
        without text posts) a one-element list holding a fixed error message
        is returned instead. Callers compare against that exact message, so
        its wording must stay unchanged.
    """
    global offset
    error_list = ['An error occurred. Possibly empty request result or your Telegram Bot Token is incorrect.']
    try:
        # Only send ``offset`` once an earlier poll has established it.
        params = {} if offset is None else {'offset': offset}
        response = requests.get(
            "https://api.telegram.org/bot{}/getUpdates".format(bot_token),
            params=params,
            timeout=10,  # do not let an unresponsive API hang the caller forever
        )
        updates = json.loads(response.text)['result']
        # An empty batch raises IndexError here and is reported as an error below.
        last_update_id = int(updates[-1]['update_id'])
        # Updates that are not text channel posts (other update kinds, posts
        # without a 'text' field) are skipped rather than failing the batch.
        text_list = [
            update['channel_post']['text']
            for update in updates
            if 'text' in update.get('channel_post', {})
        ]
        # without 'last_update_id + 1' there will be duplicate results
        offset = last_update_id + 1
    except Exception:
        # Deliberate best-effort boundary: every failure is reported to the
        # caller through the same error list instead of an exception.
        return error_list
    return text_list or error_list


"""# Classifier"""

_disaster_classifier = None  # classifier pipeline, created on first use


def _get_disaster_classifier():
    """Return the shared classifier pipeline, loading the model only once."""
    global _disaster_classifier
    if _disaster_classifier is None:
        _disaster_classifier = pipeline("sentiment-analysis", model="Madhana/disaster_msges_classifier_v1")
    return _disaster_classifier


def classify_message(bot_token):
    """Fetch new Telegram messages and keep those classified as disasters.

    Args:
        bot_token: Telegram Bot API token, passed through to ``get_data``.

    Returns:
        The messages whose top classification label is ``'DISASTER'``, in
        their original order. If ``get_data`` reported a failure, the same
        one-element error list is returned instead.
    """
    error_msg = ['An error occurred. Possibly empty request result or your Telegram Bot Token is incorrect.']
    messages = get_data(bot_token)
    # Check for the failure marker before paying for the model load.
    if error_msg[0] in messages:
        return error_msg
    if not messages:
        return []

    classifier = _get_disaster_classifier()
    return [
        text
        for text in messages
        if classifier(text)[0]['label'] == 'DISASTER'
    ]

"""# NER Pipeline"""

@spacy.Language.component("disaster_ner")
def disaster_ner(doc):
    """Pipeline component that marks phrases from ``Tamil_words`` as entities.

    Every occurrence of a phrase listed in the module-level ``Tamil_words``
    is turned into a span labelled ``"YO!"`` and the result is assigned to
    ``doc.ents``.

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``doc`` with ``doc.ents`` set to the matched spans.
    """
    matcher = PhraseMatcher(doc.vocab)
    patterns = list(nlp.tokenizer.pipe(Tamil_words))
    # spaCy v3 signature: the patterns are passed as one list. The older
    # ``add(key, None, *docs)`` form is the legacy v2 calling convention.
    matcher.add("Tamil_words", patterns)
    spans = [Span(doc, start, end, label="YO!") for _, start, end in matcher(doc)]
    # doc.ents rejects overlapping spans, so drop overlaps/duplicates first.
    doc.ents = spacy.util.filter_spans(spans)
    return doc

# Phrases that the custom "disaster_ner" component labels as "YO!" entities.
Tamil_words = ['மதனா பாலா'] # That is the author's name in Tamil; treat it as an easter egg in this app.

# Load the spaCy pipeline named "en_pipeline" and register the custom
# component so that it is placed before the pipeline's 'ner' component.
nlp = spacy.load("en_pipeline")
nlp.add_pipe("disaster_ner", name="disaster_ner", before='ner')

def create_address(row):
    """Build a comma-separated address from a row's address fields.

    The ``STREET``, ``NEIGHBORHOOD`` and ``CITY`` values are joined with
    ``", "``. Values that are empty or whitespace-only are left out, so a row
    without any address entity yields ``""`` instead of ``", , "``.

    Args:
        row: Any mapping-like object (dict, pandas Series) indexable by the
            three field names.

    Returns:
        The address string, possibly empty.
    """
    parts = (str(row[field]).strip() for field in ("STREET", "NEIGHBORHOOD", "CITY"))
    return ", ".join(part for part in parts if part)

# Module-level Nominatim geocoder used by geocode_address; the user_agent
# string is the application identifier passed to the client.
geolocator = Nominatim(user_agent="disaster-ner-app")

def geocode_address(address):
    """Look up the coordinates of ``address`` with the module-level geocoder.

    Args:
        address: Free-form address text.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``None`` when the address is
        blank, the geocoder finds no match, or the lookup fails.
    """
    # Nothing to look up: avoid a pointless request for blank addresses.
    if not address or not str(address).strip():
        return None
    try:
        location = geolocator.geocode(address)
    except Exception:
        # Geocoding is best effort; a failed lookup must not abort the whole
        # table. Unlike a bare ``except`` this lets KeyboardInterrupt and
        # SystemExit propagate.
        return None
    if location is None:
        return None
    return (location.latitude, location.longitude)

"""# With Classifier"""

_ENTITY_TYPES = ["NAME", "STREET", "NEIGHBORHOOD", "CITY", "PHONE NUMBER", "YO!"]


def _build_entity_frame(texts):
    """Run the NER pipeline over ``texts`` and tabulate the entities found.

    Args:
        texts: Iterable of message strings.

    Returns:
        A DataFrame with one row per message and the columns ``Text``, one
        column per entity type, ``Address`` (from ``create_address``) and
        ``Coordinates`` (from ``geocode_address``). With no messages the
        frame has these columns and zero rows.
    """
    rows = []
    for text in texts:
        doc = nlp(text)
        # One value per label: if a label occurs more than once in a message,
        # the last occurrence is the one that is kept.
        entities = {ent.label_: ent.text for ent in doc.ents}
        rows.append([text] + [entities.get(label, "") for label in _ENTITY_TYPES])

    df = pd.DataFrame(rows, columns=["Text"] + _ENTITY_TYPES)

    # Built with a comprehension instead of df.apply(..., axis=1): on a frame
    # with no rows, row-wise apply can hand back a DataFrame rather than a
    # Series, which cannot be assigned to a single column.
    df['Address'] = [create_address(row) for _, row in df.iterrows()]
    df['Coordinates'] = df['Address'].apply(geocode_address)

    return df


def get_classifier_ner(bot_token):
    """Extract entities from the messages that the classifier keeps.

    Args:
        bot_token: Telegram Bot API token, passed to ``classify_message``.

    Returns:
        The entity DataFrame for the messages returned by
        ``classify_message`` (which may be its error-message list).
    """
    return _build_entity_frame(classify_message(bot_token))

"""## Without Classifier"""

def get_ner(bot_token):
    """Extract entities from every fetched message, without classification.

    Args:
        bot_token: Telegram Bot API token, passed to ``get_data``.

    Returns:
        The entity DataFrame for the messages returned by ``get_data``
        (which may be its error-message list).
    """
    return _build_entity_frame(get_data(bot_token))

"""# Gradio"""

def process_ner_data(your_bot_token):
    """Gradio callback: return the NER table for the given bot token."""
    ner_table = get_ner(your_bot_token)
    return ner_table

def process_classifier_ner_data(your_bot_token):
    """Gradio callback: return the classifier + NER table for the given bot token."""
    classified_table = get_classifier_ner(your_bot_token)
    return classified_table

# Gradio UI: one tab with the token input, the result table and the action
# buttons, and one tab with the user guide.
demo = gr.Blocks()

with demo:
    gr.Markdown("Telegram Disaster Recovery Assistant")
    with gr.Tabs():
        with gr.TabItem("Structured Telegram Messages"):
            with gr.Row():
                your_bot_token = gr.Textbox(type='password', label="Enter your Bot Token")
                ner_df = gr.Dataframe(headers=["NAME", "STREET", "NEIGHBORHOOD", "CITY", "PHONE NUMBER","YO!"])

            classifier_ner_button = gr.Button("Get Classifier-NER Output")
            ner_button = gr.Button("Get NER Output")
            clear = gr.Button("Clear")

        with gr.TabItem("User Guide"):
            with gr.Row():
              # Step 6 sits on its own line: the previous "\6." was read by
              # Python as an octal escape and emitted a control character.
              gr.Markdown("""This is an Telegram based Disaster Recovery Assist app that uses Named Entity Recognition to extract important entities from the unstructured text and stores it in an dataframe. 
              You need to provide your personal Telegram Bot API token (API token of the bot that is added to the channel as an administrator) to use this app.
              **Steps to create a Telegram Bot**: 
              1. Download the Telegram app on your device or use the web version. 
              2. Open the app and search for the "BotFather" bot. 
              3. Start a chat with the BotFather bot by clicking on the "START" button. 
              4. Type "/newbot" and follow the on-screen instructions to create a new bot. 
              5. Choose a name and username for your bot. 
              6. Once your bot is created, the BotFather will give you a unique API token.
              
              **Steps to add your telegram bot to your channel as an administrator**: 
              1. Create a new channel or choose an existing one that you want to use the bot in. 
              2. Add your bot to the channel as an administrator. To do this, go to the channel settings, click on "Administrators", and then click on "Add Administrator". Search for your bot and add it to the channel. 
              3. Now you can send commands to the bot in the channel by mentioning the bot using the "@" symbol followed by the bot's username. For example, "@my_bot help" will send the "help" command to the bot.
              
              **Get Classifier-NER Output VS Get NER Output**:
              The 'Get Classifier Ner Output' function first classifies the message as either a disaster message or a random message, and then applies the NER pipeline to the classified output. In contrast, the 'Get NER Output' function applies the NER pipeline directly to the message.
              *If you get any errors or dependency issues, feel free to reach out to me!*""")

    # Both action buttons read the token textbox and write into the same
    # table; "Clear" empties the table by returning None.
    ner_button.click(process_ner_data,inputs=your_bot_token, outputs=ner_df)
    classifier_ner_button.click(process_classifier_ner_data,inputs=your_bot_token, outputs=ner_df)
    clear.click(lambda: None, None, ner_df, queue=True)


# Enable the request queue before starting the app.
demo.queue(concurrency_count=3)
demo.launch() # share=True, debug=True