Spaces:
Sleeping
Sleeping
Initial commit
Browse files
app.py
CHANGED
@@ -2,8 +2,7 @@ import warnings
|
|
2 |
import torchvision
|
3 |
import torch
|
4 |
import pandas as pd
|
5 |
-
from transformers
|
6 |
-
from transformers import AutoTokenizer, AutoModel
|
7 |
from sklearn.metrics.pairwise import cosine_similarity
|
8 |
import streamlit as st
|
9 |
|
@@ -11,56 +10,41 @@ import streamlit as st
|
|
11 |
torchvision.disable_beta_transforms_warning()
|
12 |
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
|
13 |
|
14 |
-
# Initialize fill-mask pipeline and model/tokenizer for embedding
|
15 |
-
pipe = pipeline(
|
16 |
-
|
|
|
|
|
|
|
|
|
17 |
model = AutoModel.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
|
18 |
|
19 |
# Function to generate embeddings for full sentences
|
20 |
def get_embedding(text):
|
21 |
-
inputs = tokenizer(text, return_tensors="pt")
|
22 |
with torch.no_grad():
|
23 |
outputs = model(**inputs)
|
24 |
return outputs.last_hidden_state[:, 0, :].cpu().numpy()
|
25 |
|
26 |
# Streamlit app setup
|
27 |
st.title("Thai Full Sentence Similarity App")
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
In this example,
|
35 |
-
-
|
36 |
-
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
### Potential Predictions
|
41 |
-
Here are some possible predictions the model might generate for `<mask>`:
|
42 |
-
1. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"` - Chiang Mai
|
43 |
-
2. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เขาใหญ่ เพื่อสัมผัสธรรมชาติ"` - Khao Yai
|
44 |
-
3. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เกาะสมุย เพื่อสัมผัสธรรมชาติ"` - Koh Samui
|
45 |
-
4. `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน ภูเก็ต เพื่อสัมผัสธรรมชาติ"` - Phuket
|
46 |
-
|
47 |
-
### Results Table
|
48 |
-
For each prediction, the app calculates:
|
49 |
-
- **Similarity Score**: Indicates how similar the predicted sentence is to the original input.
|
50 |
-
- **Model Score**: Represents the model's confidence in the predicted word for `<mask>`.
|
51 |
-
|
52 |
-
### Most Similar Prediction
|
53 |
-
The app will display the most contextually similar prediction based on the similarity score. For example:
|
54 |
-
- **Most Similar Prediction**: `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน เชียงใหม่ เพื่อสัมผัสธรรมชาติ"`
|
55 |
-
- **Similarity Score**: 0.89
|
56 |
-
- **Model Score**: 0.16
|
57 |
-
|
58 |
-
Feel free to enter your own sentence with `<mask>` and explore the predictions!
|
59 |
""")
|
60 |
|
61 |
# User input box
|
62 |
st.subheader("Input Text")
|
63 |
-
input_text = st.text_input("Enter a sentence with `<mask>` to find similar predictions:", "
|
64 |
|
65 |
# Ensure the input includes a `<mask>`
|
66 |
if "<mask>" not in input_text:
|
|
|
2 |
import torchvision
|
3 |
import torch
|
4 |
import pandas as pd
|
5 |
+
from transformers import pipeline, AutoTokenizer, AutoModel
|
|
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
7 |
import streamlit as st
|
8 |
|
|
|
10 |
torchvision.disable_beta_transforms_warning()
|
11 |
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
|
12 |
|
13 |
+
# Initialize fill-mask pipeline and model/tokenizer for embedding with slow tokenizer
|
14 |
+
pipe = pipeline(
|
15 |
+
"fill-mask",
|
16 |
+
model="airesearch/wangchanberta-base-att-spm-uncased",
|
17 |
+
tokenizer=AutoTokenizer.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased", use_fast=False),
|
18 |
+
framework="pt"
|
19 |
+
)
|
20 |
model = AutoModel.from_pretrained("airesearch/wangchanberta-base-att-spm-uncased")
|
21 |
|
22 |
# Function to generate embeddings for full sentences
|
23 |
def get_embedding(text):
|
24 |
+
inputs = pipe.tokenizer(text, return_tensors="pt")
|
25 |
with torch.no_grad():
|
26 |
outputs = model(**inputs)
|
27 |
return outputs.last_hidden_state[:, 0, :].cpu().numpy()
|
28 |
|
29 |
# Streamlit app setup
|
30 |
st.title("Thai Full Sentence Similarity App")
|
31 |
+
|
32 |
+
# Explanation of example usage
|
33 |
+
st.markdown("""
|
34 |
+
### Example Sentence with Mask:
|
35 |
+
**Input:** `"นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ"`
|
36 |
+
|
37 |
+
In this example, the model will replace `<mask>` with possible locations in Thailand, such as:
|
38 |
+
- "เชียงใหม่" for "Chiang Mai"
|
39 |
+
- "เขาใหญ่" for "Khao Yai"
|
40 |
+
- "ภูเก็ต" for "Phuket"
|
41 |
+
|
42 |
+
The app will compute the similarity between the full sentences generated and the baseline sentence without `<mask>`.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
""")
|
44 |
|
45 |
# User input box
|
46 |
st.subheader("Input Text")
|
47 |
+
input_text = st.text_input("Enter a sentence with `<mask>` to find similar predictions:", "นักท่องเที่ยวจำนวนมากเลือกที่จะไปเยือน <mask> เพื่อสัมผัสธรรมชาติ")
|
48 |
|
49 |
# Ensure the input includes a `<mask>`
|
50 |
if "<mask>" not in input_text:
|