👜 baseline
Browse files- README.md +5 -29
- app.py +45 -0
- grand_historian.csv +0 -0
- requirements.txt +5 -0
README.md
CHANGED
@@ -1,37 +1,13 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
colorTo: purple
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
|
|
|
|
10 |
|
11 |
-
# Configuration
|
12 |
|
13 |
-
`title`: _string_
|
14 |
-
Display title for the Space
|
15 |
-
|
16 |
-
`emoji`: _string_
|
17 |
-
Space emoji (emoji-only character allowed)
|
18 |
-
|
19 |
-
`colorFrom`: _string_
|
20 |
-
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
21 |
-
|
22 |
-
`colorTo`: _string_
|
23 |
-
Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
|
24 |
-
|
25 |
-
`sdk`: _string_
|
26 |
-
Can be either `gradio`, `streamlit`, or `static`
|
27 |
-
|
28 |
-
`sdk_version` : _string_
|
29 |
-
Only applicable for `streamlit` SDK.
|
30 |
-
See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
|
31 |
-
|
32 |
-
`app_file`: _string_
|
33 |
-
Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
|
34 |
-
Path is relative to the root of the repository.
|
35 |
-
|
36 |
-
`pinned`: _boolean_
|
37 |
-
Whether the Space stays on top of your list.
|
|
|
1 |
---
|
2 |
+
title: Cross language search
|
3 |
+
emoji: ⚔️
|
4 |
+
colorFrom: indigo
|
5 |
colorTo: purple
|
6 |
sdk: streamlit
|
7 |
app_file: app.py
|
8 |
pinned: false
|
9 |
---
|
10 |
+
# Cross Language Search
|
11 |
+
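> Search ancient books with modern words: type a query in modern vernacular Chinese and get the closest-matching sentences from a classical Chinese text (Records of the Grand Historian).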
|
12 |
|
|
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
from forgebox.cosine import CosineSearch
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
# Model identifier passed to SentenceTransformer(...) in load_encoder() and
# shown in its loading spinner.
# NOTE(review): the spelling "classicical" is part of the identifier string;
# presumably it matches the published model name — verify before "fixing" it.
TAG = "raynardj/xlsearch-cross-lang-search-zh-vs-classicical-cn"
|
8 |
+
|
9 |
+
# NOTE(review): st.cache is deprecated in newer Streamlit releases in favour
# of st.cache_resource / st.cache_data — confirm the Streamlit version this
# Space runs on before migrating.
@st.cache(allow_output_mutation=True)
def load_encoder():
    """Build the sentence encoder identified by ``TAG``.

    A spinner naming the model is displayed while the model is constructed.
    The result is memoised by ``st.cache`` so the model is not rebuilt on
    every script rerun.

    Returns:
        The ``SentenceTransformer`` instance created from ``TAG``.
    """
    with st.spinner(f"Loading Transformer:{TAG}"):
        return SentenceTransformer(TAG)
|
14 |
+
|
15 |
+
# Module-level encoder shared by encode_book() (book sentences) and
# search() (query text).
encoder = load_encoder()
|
16 |
+
|
17 |
+
@st.cache(allow_output_mutation=True)
def load_book():
    """Load the book's sentences from ``grand_historian.csv``.

    The CSV is read relative to the current working directory and must
    contain a ``sentence`` column. The result is memoised by ``st.cache``.

    Returns:
        list: The values of the ``sentence`` column, in file order.

    Raises:
        KeyError: If the CSV has no ``sentence`` column.
    """
    # Plain string literal: the original used an f-string with no placeholders.
    with st.spinner("📚 Loading Book..."):
        book_df = pd.read_csv("grand_historian.csv")
    # Bracket access is explicit about reading a column and cannot be shadowed
    # by a DataFrame attribute or method of the same name.
    return book_df["sentence"].tolist()
|
22 |
+
|
23 |
+
# List of book sentences; encoded by encode_book() and indexed by search()
# to turn result positions back into text.
all_lines = load_book()
|
24 |
+
|
25 |
+
@st.cache(allow_output_mutation=True)
def encode_book():
    """Encode every sentence of the book and build the search index.

    Reads the module-level ``encoder`` and ``all_lines``. The result is
    memoised by ``st.cache`` so the encoding is not repeated on every rerun.

    Returns:
        A ``CosineSearch`` built from the encoded sentence vectors.
    """
    # Plain string literal: the original used an f-string with no placeholders.
    with st.spinner("Encoding sentences for book《Records of the Grand Historian》"):
        sentence_vecs = encoder.encode(
            all_lines,
            batch_size=64,
            show_progress_bar=True,
        )
        return CosineSearch(sentence_vecs)
|
31 |
+
|
32 |
+
# Search index over the encoded book; search() calls it with the encoded query.
cosine = encode_book()
|
33 |
+
|
34 |
+
def search(text, top_k=5):
    """Return the book sentences that best match a query.

    Args:
        text: Query string to encode with the module-level ``encoder``.
        top_k: Maximum number of sentences to return. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        pandas.DataFrame: A single ``"sentence"`` column holding up to
        ``top_k`` entries of ``all_lines``, in the order given by ``cosine``.
    """
    query_vec = encoder.encode(text)  # encode the search key
    # NOTE(review): the result is used as a sequence of row positions into
    # all_lines, presumably best match first — confirm against
    # forgebox.cosine.CosineSearch.
    order = cosine(query_vec)
    # Index the list directly rather than copying all of all_lines into a
    # fresh numpy array on every query just to pick a handful of rows.
    top_sentences = [all_lines[idx] for idx in order[:top_k]]
    return pd.DataFrame({"sentence": top_sentences})
|
39 |
+
|
40 |
+
# --- Page body: query box, search button, result table ---
keyword = st.text_input("用白话搜", "")
search_clicked = st.button("搜索")

# Only search when the button was pressed on this rerun and the query box
# is non-empty; otherwise render nothing below the inputs.
if search_clicked and keyword:
    with st.spinner(f"🔍 Searching for {keyword}"):
        df = search(keyword)
        st.table(df)
|
grand_historian.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch==1.7.1
|
2 |
+
sentence-transformers==2.1.0
|
3 |
+
transformers==4.12.3
|
4 |
+
pandas==1.3.5
|
5 |
+
forgebox==0.4.20
|