Spaces: Runtime error
Upload 23 files
- .gitattributes +1 -0
- Dockerfile +20 -0
- Hotel New York Combined.csv +0 -0
- README.md +1 -13
- Untitled.ipynb +90 -0
- app.py +297 -0
- app.yaml +3 -0
- basic.py +166 -0
- combined_paris.csv +0 -0
- corpus_embeddings_bi_encoder.pickle +3 -0
- corpus_embeddings_bi_encoder.pickle 2 +0 -0
- df_combined.csv +0 -0
- df_combined_paris.csv +0 -0
- embeddings.npy +3 -0
- embeddings_h_r.npy +3 -0
- embeddings_review.npy +3 -0
- en_core_web_sm-3.2.0-py3-none-any.whl +3 -0
- paris-newer.py +295 -0
- paris.py +298 -0
- paris_clean_newer.csv +0 -0
- query_generator.ipynb +0 -0
- requirements.txt +14 -0
- summary.ipynb +654 -0
- tokenized_corpus.pickle +3 -0
.gitattributes
CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+en_core_web_sm-3.2.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
Dockerfile
ADDED
@@ -0,0 +1,20 @@
+#Base Image to use
+FROM python:3.7.9
+
+#Expose port 8080
+EXPOSE 8080
+
+#Copy Requirements.txt file into app directory
+COPY requirements.txt app/requirements.txt
+
+#install all requirements in requirements.txt
+RUN pip3 install -r app/requirements.txt
+
+#Copy all files in current directory into app directory
+COPY . /app
+
+#Change Working Directory to app directory
+WORKDIR /app
+
+#Run the application on port 8080
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8080", "--server.address=0.0.0.0"]
Hotel New York Combined.csv
ADDED
The diff for this file is too large to render.
README.md
CHANGED
@@ -1,13 +1 @@
----
-title: Parishotel
-emoji: 🐠
-colorFrom: yellow
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.10.0
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+assignment3
Untitled.ipynb
ADDED
@@ -0,0 +1,90 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "611a3e0e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Init Plugin\n",
+      "Init Graph Optimizer\n",
+      "Init Kernel\n",
+      "Collecting en-core-web-sm==3.2.0\n",
+      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)\n",
+      "     |████████████████████████████████| 13.9 MB 463 kB/s \n",
+      "\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from en-core-web-sm==3.2.0) (3.2.1)\n",
+      "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.7.5)\n",
+      "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.13)\n",
+      "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.2)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.62.3)\n",
+      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)\n",
+      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
+      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (21.0)\n",
+      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3.0)\n",
+      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.8)\n",
+      "Requirement already satisfied: jinja2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.1)\n",
+      "Requirement already satisfied: numpy>=1.15.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.21.4)\n",
+      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.8.2)\n",
+      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.1)\n",
+      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.6)\n",
+      "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.8.2)\n",
+      "Requirement already satisfied: pathy>=0.3.5 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.6.1)\n",
+      "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.4.0)\n",
+      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.26.0)\n",
+      "Requirement already satisfied: setuptools in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (59.0.1)\n",
+      "Requirement already satisfied: pyparsing>=2.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.7)\n",
+      "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (5.2.1)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.1.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.2)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2021.5.30)\n",
+      "Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.4)\n",
+      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.26.6)\n",
+      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.4)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.1)\n",
+      "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
+      "You should consider upgrading via the '/opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/bin/python -m pip install --upgrade pip' command.\u001b[0m\n",
+      "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
+      "You can now load the package via spacy.load('en_core_web_sm')\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python -m spacy download en_core_web_sm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51a414e5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.5 64-bit ('tensorflow': conda)",
+   "language": "python",
+   "name": "python395jvsc74a57bd04bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
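The notebook's only code cell shells out to spaCy's CLI. An equivalent programmatic form, as a minimal sketch using spaCy's documented spacy.cli.download helper, would be:

    import spacy
    from spacy.cli import download

    # Equivalent to `!python -m spacy download en_core_web_sm`:
    # download() fetches and pip-installs the model wheel, after which
    # the pipeline can be loaded by name.
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")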
app.py
ADDED
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+
+
+@author: Hamza Farooq
+"""
+
+import spacy
+from spacy.lang.en.stop_words import STOP_WORDS
+from string import punctuation
+from collections import Counter
+from heapq import nlargest
+import os
+nlp = spacy.load("en_core_web_sm")
+from spacy import displacy
+import streamlit as st
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from matplotlib import pyplot as plt
+import nltk
+nltk.download('stopwords')
+import geonamescache
+
+import os
+import streamlit as st
+import utils as utl
+from PIL import Image
+import time
+import torch
+import transformers
+from transformers import BartTokenizer, BartForConditionalGeneration
+tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+mdl.to(torch_device)
+
+def main():
+    # Settings
+    st.set_page_config(layout="wide", page_title='New York Hotels')
+    def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):
+
+        text = text.replace('\n','')
+        text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
+        summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))
+        summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)
+        return summary_txt
+
+
+    gc = geonamescache.GeonamesCache()
+
+    # gets nested dictionary for countries
+    countries = gc.get_countries()
+
+    # gets nested dictionary for cities
+    cities = gc.get_cities()
+    # def gen_dict_extract(var, key):
+    #     if isinstance(var, dict):
+    #         for k, v in var.items():
+    #             if k == key:
+    #                 yield v
+    #             if isinstance(v, (dict, list)):
+    #                 yield from gen_dict_extract(v, key)
+    #     elif isinstance(var, list):
+    #         for d in var:
+    #             yield from gen_dict_extract(d, key)
+    #
+    # cities = [*gen_dict_extract(cities, 'name')]
+    # countries = [*gen_dict_extract(countries, 'name')]
+    #
+    # cities.append('New York')
+
+    from nltk.corpus import stopwords
+
+    stopwords = set(stopwords.words('english'))
+    #mask = np.array(Image.open('upvote.png'))
+
+    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
+    import matplotlib.pyplot as plt
+    #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
+    st.title("New York Hotel Finder")
+
+
+    stopwords=list(STOP_WORDS)
+    stopwords.extend(['hotel','room','rooms'])
+    from string import punctuation
+    punctuation=punctuation+ '\n'
+
+    import pandas as pd
+    from sentence_transformers import SentenceTransformer
+    import scipy.spatial
+    import pickle as pkl
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    #import os
+
+    embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+    df_all = pd.read_csv('Hotel New York Combined.csv')
+
+    df_all = df_all[['hotel_name','review_body']]
+    #
+    # df['hotel_name'].drop_duplicates()
+
+    # df_combined = df.sort_values(['hotel_name']).groupby('hotel_name', sort=False).review_body.apply(''.join).reset_index(name='all_review')
+
+    import re
+
+    df_combined = pd.read_csv('df_combined.csv')
+
+    # df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
+    #
+    # def lower_case(input_str):
+    #     input_str = input_str.lower()
+    #     return input_str
+    #
+    # df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
+
+    df = df_combined
+
+    df_sentences = df_combined.set_index("all_review")
+
+    df_sentences = df_sentences["hotel_name"].to_dict()
+    df_sentences_list = list(df_sentences.keys())
+
+    import pandas as pd
+    from tqdm import tqdm
+    from sentence_transformers import SentenceTransformer, util
+
+    df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
+    #
+    corpus = df_sentences_list
+    corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
+    #
+    # model = SentenceTransformer('all-MiniLM-L6-v2')
+    # paraphrases = util.paraphrase_mining(model, corpus)
+
+    #queries = ['Hotel close to Central Park',
+    #           'Hotel with breakfast'
+    #           ]
+
+
+    # from transformers import AutoTokenizer, AutoModel
+    # import torch
+    # import torch.nn.functional as F
+    #
+    # #Mean Pooling - Take attention mask into account for correct averaging
+    # def mean_pooling(model_output, attention_mask):
+    #     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    #     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    #     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    #
+    #
+    # # Sentences we want sentence embeddings for
+    # sentences = corpus
+    #
+    # # Load model from HuggingFace Hub
+    # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
+    # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
+    #
+    # # Tokenize sentences
+    # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+    #
+    # # Compute token embeddings
+    # with torch.no_grad():
+    #     model_output = model(**encoded_input)
+    #
+    # # Perform pooling
+    # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    #
+    # # Normalize embeddings
+    # sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+    #
+    # st.text("Sentence embeddings:")
+    # st.text(sentence_embeddings)
+    #
+    #
+
+    #corpus_embeddings = sentence_embeddings
+    # Query sentences
+
+    def plot_cloud(wordcloud):
+        # Set figure size
+        plt.figure(figsize=(40, 30))
+        # Display image
+        st.pyplot(wordcloud)
+        # No axis details
+        #st.pyplot.axis("off");
+    userinput = st.text_input('Tell us what you are looking for in your hotel')
+    if not userinput:
+        st.write("Please enter a query to get results")
+    else:
+        query = [str(userinput)]
+        doc = nlp(str(userinput))
+        for ent in doc.ents:
+            if ent.label_ == 'GPE':
+                if ent.text in countries:
+                    st.write(f"Country : {ent.text}")
+                elif ent.text in cities:
+                    st.write("city")
+                    st.write(ent.text)
+                    st.write(f"City : {ent.text}")
+                else:
+                    print(f"Other GPE : {ent.text}")
+        # query_embeddings = embedder.encode(queries,show_progress_bar=True)
+        top_k = min(5, len(corpus))
+
+        query_embedding = embedder.encode(query, convert_to_tensor=True)
+
+        # We use cosine-similarity and torch.topk to find the highest 5 scores
+        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
+        top_results = torch.topk(cos_scores, k=top_k)
+
+        # st.write("\n\n======================\n\n")
+        # st.write("Query:", query)
+        # # doc = nlp(query)
+        sentence_spans = list(doc.sents)
+        ent_html = displacy.render(doc, style="ent", jupyter=False)
+        # Display the entity visualization in the browser:
+        st.markdown(ent_html, unsafe_allow_html=True)
+
+        #displacy.render(doc, jupyter = True, style="ent")
+        st.write("##")
+        st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
+        st.write("\n\n======================\n\n")
+
+        for score, idx in zip(top_results[0], top_results[1]):
+
+            row_dict = df.loc[df['all_review']== corpus[idx]]
+            st.subheader(row_dict['hotel_name'].values[0])
+            hotel_subset = df_all.loc[df_all['hotel_name']==row_dict['hotel_name'].values[0]]
+            st.caption("Review Summary:")
+            st.write(row_dict['summary'].values[0])
+            st.caption("Relevancy: {:.4f}".format(score))
+            st.caption("Relevant reviews:")
+
+            df_sentences_h = hotel_subset.set_index("review_body")
+
+            df_sentences_h = df_sentences_h["hotel_name"].to_dict()
+            df_sentences_list_h = list(df_sentences_h.keys())
+
+
+
+            df_sentences_list_h = [str(d) for d in tqdm(df_sentences_list_h)]
+            #
+            corpus_h = df_sentences_list_h
+            corpus_embeddings_h = embedder.encode(corpus_h,show_progress_bar=True)
+            cos_scores_h = util.pytorch_cos_sim(query_embedding, corpus_embeddings_h)[0]
+            top_results_h = torch.topk(cos_scores_h, k=top_k)
+
+            for score, idx in zip(top_results_h[0], top_results_h[1]):
+                st.write(corpus_h[idx])
+
+            # st.table(hotel_subset.head())
+
+    # st.write("#")
+    #wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='navy', colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(corpus[idx])
+    # wordcloud = WordCloud(collocations=False,stopwords=stopwords,background_color='black',max_words=35).generate(corpus[idx])
+    # fig, ax = plt.subplots()
+    # plt.imshow(wordcloud, interpolation='bilinear')
+    # plt.axis("off")
+    # plt.show()
+    # st.pyplot(fig)
+    # st.set_option('deprecation.showPyplotGlobalUse', False)
+
+
+if __name__ == '__main__':
+    main()
+
+
+# cos_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
+# top_results = torch.topk(cos_scores, k=top_k)
+
+# st.write("\n\n======================\n\n")
+# st.write("Query:", query)
+# st.write("\nTop 5 most similar sentences in corpus using sentence embedding:")
+#
+# for score, idx in zip(top_results[0], top_results[1]):
+#     st.write("(Score: {:.4f})".format(score))
+#     row_dict = df.loc[df['all_review']== corpus[idx]]
+#     st.write("paper_id: " , row_dict['hotel_name'] , "\n")
+#     #wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='navy', colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(corpus[idx])
+#     wordcloud = WordCloud(collocations=False,stopwords=stopwords,background_color='black',max_words=35).generate(corpus[idx])
+#     fig, ax = plt.subplots()
+#     plt.imshow(wordcloud, interpolation='bilinear')
+#     plt.axis("off")
+#     plt.show()
+#     st.pyplot(fig)
+#     st.set_option('deprecation.showPyplotGlobalUse', False)
+
+
+# embedder = SentenceTransformer('all-MiniLM-L6-v2')
+#
+# corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
+
+
+# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
app.yaml
ADDED
@@ -0,0 +1,3 @@
+service: default
+runtime: custom
+env: flex
basic.py
ADDED
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+from datetime import datetime
+from time import time
+from lxml import html,etree
+from reviews_final import scrape, write_in_csv
+import pandas as pd
+import requests,re
+import os,sys
+import unicodecsv as csv
+import argparse
+import numpy as np
+import json
+def clean(text):
+    if text:
+        # Removing \n \r and \t
+        return ' '.join(''.join(text).split()).strip()
+    return None
+
+
+
+
+def parse(locality,checkin_date,checkout_date,sort):
+    checkIn = checkin_date.strftime("%Y/%m/%d")
+    checkOut = checkout_date.strftime("%Y/%m/%d")
+    print ("Scraper Initiated for Locality:%s"%locality)
+    header = {
+
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
+    }
+    # TA rendering the autocomplete list using this API
+    print ("Finding search result page URL")
+    geo_url = 'https://www.tripadvisor.com/TypeAheadJson?action=API&startTime='+str(int(time()))+'&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true&query='+locality
+    api_response = requests.get(geo_url,headers=header, timeout=120).json()
+    # getting the TA url for the query from the autocomplete response
+    url_from_autocomplete = "http://www.tripadvisor.com"+api_response['results'][0]['url']
+    print ('URL found %s'%url_from_autocomplete)
+    geo = api_response['results'][0]['value']
+    # Formatting date for writing to file
+    a=url_from_autocomplete
+    b=a.split("-")
+    s="-"
+    c=s.join([b[0],b[1],"oa30",b[2],b[3]])
+    d=s.join([b[0],b[1],"oa60",b[2],b[3]])
+    e=s.join([b[0],b[1],"oa90",b[2],b[3]])
+    f=s.join([b[0],b[1],"oa120",b[2],b[3]])
+    urllist = [a,c,d,e,f]
+
+    date = checkin_date.strftime("%Y_%m_%d")+"_"+checkout_date.strftime("%Y_%m_%d")
+    # form data to get the hotels list from TA for the selected date
+    form_data = {'changeSet': 'TRAVEL_INFO',
+                 'showSnippets': 'false',
+                 'staydates':date,
+                 'uguests': '2',
+                 'sortOrder':sort
+
+                 }
+
+
+
+    json_arr = []
+    for url_from_autocomplete in urllist:
+        print(url_from_autocomplete)
+
+        headers = {
+            'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
+            'Accept-Encoding': 'gzip,deflate',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Cache-Control': 'no-cache',
+            'Connection': 'keep-alive',
+            'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
+            'Host': 'www.tripadvisor.com',
+            'Pragma': 'no-cache',
+            'Referer': url_from_autocomplete,
+            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
+            'X-Requested-With': 'XMLHttpRequest'
+        }
+        cookies= {"SetCurrency":"USD"}
+        print ("Downloading search results page")
+        page_response = requests.post(url = url_from_autocomplete,data=form_data,headers = headers, cookies = cookies, verify=False)
+        print ("Parsing results ")
+        parser = html.fromstring(page_response.text)
+        hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]')
+        hotel_data = []
+        if not hotel_lists:
+            hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[@class="listing "]')
+
+        for hotel in hotel_lists:
+            XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
+            XPATH_REVIEWS = './/a[@class="review_count"]//text()'
+            XPATH_RANK = './/div[@class="popindex"]//text()'
+            XPATH_RATING = './/span[contains(@class,"ui_bubble_rating bubble_45")]/@alt' #update this code to get rating
+            XPATH_RATING_2 = './/a[contains(@class,"ui_bubble_rating bubble_45")]/@alt' #update this code to get rating
+            XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
+            XPATH_HOTEL_FEATURES = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
+            XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
+            XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()'
+            XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()' #<span class="dekGp Ci _R S4 H3 MD">#74 of 319 hotels in Lisbon</span><span class="dekGp Ci _R S4 H3 MD">#6 of 319 hotels in Lisbon</span>
+            XPATH_RATING_ORDER = './/span[contains(@class,"dekGp Ci _R S4 H3 MD")]//text()'
+            XPATH_OFFICIAL_DESCRIPTION = '//div[contains(text(),"Description")]/following-sibling::div//span[contains(@class,"introText")]/text()'
+
+
+            raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
+            raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
+            raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
+            raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
+            raw_rank = hotel.xpath(XPATH_RANK)
+            raw_rating = hotel.xpath(XPATH_RATING_2)
+            raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
+            raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
+            raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)
+            raw_rank_order = hotel.xpath(XPATH_RATING_ORDER)
+            raw_official_description = parser.xpath(XPATH_OFFICIAL_DESCRIPTION)
+
+            url = 'http://www.tripadvisor.com'+raw_hotel_link[0] if raw_hotel_link else None
+            reviews = ''.join(raw_no_of_reviews).replace("reviews","").replace(",","") if raw_no_of_reviews else 0
+            rank = ''.join(raw_rank) if raw_rank else None
+            rating = ''.join(raw_rating).replace('of 5 bubbles','').strip() if raw_rating else None
+            name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
+            hotel_features = ','.join(raw_hotel_features)
+            #price_per_night = ''.join(raw_hotel_price_per_night).encode('utf-8').replace('\n','') if raw_hotel_price_per_night else None
+            price_per_night = ''.join(raw_hotel_price_per_night).replace('\n','') if raw_hotel_price_per_night else None
+            rank_order = ''.join(raw_rank_order) if raw_rank_order else None
+            no_of_deals = re.findall("all\s+?(\d+)\s+?",''.join(raw_no_of_deals))
+            booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None
+            official_description = clean(raw_official_description)
+
+            if no_of_deals:
+                no_of_deals = no_of_deals[0]
+            else:
+                no_of_deals = 0
+
+            data = {
+                'hotel_name':name,
+                'url':url,
+                'locality':locality,
+                'reviews':reviews,
+                'rank':rank,
+                'tripadvisor_rating':rating,
+                'checkOut':checkOut,
+                'checkIn':checkIn,
+                'hotel_features':hotel_features,
+                'price_per_night':price_per_night,
+                'no_of_deals':no_of_deals,
+                'booking_provider':booking_provider,
+                'raw_rank': rank_order,
+                'desc':official_description
+
+            }
+
+
+            if data:
+                print("Writing scraped data")
+                json_arr.append(data)
+                with open('data_file.json', 'w') as outfile:
+                    json.dump(json_arr, outfile)
+                # hotel_data.append(data)
+                # all_hotel.append(data)
+                # #Referrer is necessary to get the correct response from TA if not provided they will redirect to home page
+                # my_df=pd.DataFrame(all_hotel)
+                # print(my_df['hotel_name'])
+
+
+
+
+
+    return urllist
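A hypothetical driver for the scraper above, assuming the reviews_final module it imports is available; the locality string and the 'popularity' sort key are illustrative values, and parse() only requires objects with strftime, so datetime works:

    from datetime import datetime

    # Scrape the first five paginated result pages for a locality; parse()
    # writes data_file.json as a side effect and returns the page URLs.
    urls = parse('New York', datetime(2022, 10, 5), datetime(2022, 10, 8), 'popularity')
    print(urls)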
combined_paris.csv
ADDED
The diff for this file is too large to render.
corpus_embeddings_bi_encoder.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1854af45783940daefdea27ee8e42f026faefdc4ff4a41067c6ee4ca6eb74ade
+size 64918
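The three lines above are not the pickle itself but a Git LFS pointer (spec v1); the actual bytes live in LFS storage, keyed by the sha256 oid. A minimal sketch of reading such a pointer:

    def parse_lfs_pointer(path):
        # LFS pointers are tiny 'key value' text files: version, oid, size.
        fields = {}
        with open(path) as fh:
            for line in fh:
                key, _, value = line.strip().partition(' ')
                fields[key] = value
        return fields

    # parse_lfs_pointer('corpus_embeddings_bi_encoder.pickle')
    # -> {'version': 'https://git-lfs.github.com/spec/v1',
    #     'oid': 'sha256:1854af45...', 'size': '64918'}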
corpus_embeddings_bi_encoder.pickle 2
ADDED
Binary file (64.9 kB).
df_combined.csv
ADDED
The diff for this file is too large to render.
df_combined_paris.csv
ADDED
The diff for this file is too large to render.
embeddings.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3759225896afa4282dee721d96d1d1a8085cde7ccffe29e975568a5499a36548
+size 64640
embeddings_h_r.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:76ae4840488129cd4c6917830018219292cca514e62c69ea9e507b185d219aa7
+size 4391552
embeddings_review.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96cee6d14a75d19eccbe9decb501dd3c5de6c1fe401d3803a82611f075a8a6a8
+size 144512
en_core_web_sm-3.2.0-py3-none-any.whl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e02939fb7fbae6dbcc9c5a1355f5e4e02939b649a1f0846ee844ac1d479bbeb
+size 13900196
paris-newer.py
ADDED
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+
+
+@author: Hamza Farooq
+"""
+
+import spacy
+from spacy.lang.en.stop_words import STOP_WORDS
+from string import punctuation
+from collections import Counter
+from heapq import nlargest
+import os
+nlp = spacy.load("en_core_web_sm")
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+import datetime
+
+from spacy import displacy
+import streamlit as st
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from matplotlib import pyplot as plt
+
+import nltk
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction import _stop_words
+import string
+from tqdm.autonotebook import tqdm
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import scipy.spatial
+import pickle
+from sentence_transformers import SentenceTransformer, util
+import torch
+
+
+
+
+
+# import utils as utl
+
+import time
+import torch
+import transformers
+from transformers import BartTokenizer, BartForConditionalGeneration
+from string import punctuation
+# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import scipy.spatial
+
+
+from sentence_transformers import SentenceTransformer, util
+import torch
+
+
+
+def main():
+
+
+
+
+    # Settings
+    st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈" )
+    from string import punctuation
+    punctuation=punctuation+ '\n'
+
+
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    import numpy as np
+    import pandas as pd
+    from sentence_transformers import SentenceTransformer
+    import scipy.spatial
+
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    #import os
+    @st.cache(allow_output_mutation=True)
+    def load_model():
+        return SentenceTransformer('all-MiniLM-L6-v2'),SentenceTransformer('multi-qa-MiniLM-L6-cos-v1'),CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+    embedder,bi_encoder,cross_encoder = load_model()
+
+
+
+
+    #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
+    st.title("Parisian Hotel Finder")
+    with st.expander("ℹ️ - About this app", expanded=True):
+
+        st.write(
+            """
+- This app allows you to search for hotels based on what you're looking for, rather than just cities - it helps with reducing time to go through exhaustive reviews for each hotel!
+- It uses an innovative semantic search approach that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗.
+            """
+        )
+
+
+    punctuation=punctuation+ '\n'
+
+
+    #import os
+
+    # embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+
+
+    def lower_case(input_str):
+        input_str = input_str.lower()
+        return input_str
+
+    df_all = pd.read_csv('paris_clean_newer.csv')
+
+
+    df_combined = df_all.sort_values(['Hotel']).groupby('Hotel', sort=False).text.apply(''.join).reset_index(name='all_review')
+    df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
+    df_combined_paris_summary = df_combined_paris_summary[['Hotel','summary']]
+
+    import re
+
+    # df_combined = pd.read_csv('df_combined.csv')
+
+    df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]','',x))
+
+
+    df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
+    df_basic = df_all[['Hotel','description','price_per_night']].drop_duplicates()
+    df_basic = df_basic.merge(df_combined_paris_summary,how='left')
+    df_combined_e = df_combined.merge(df_basic)
+    df_combined_e['all_review'] =df_combined_e['description']+ df_combined_e['all_review'] + df_combined_e['price_per_night']
+
+    df = df_combined_e.copy()
+
+
+    df_sentences = df_combined_e.set_index("all_review")
+
+    df_sentences = df_sentences["Hotel"].to_dict()
+    df_sentences_list = list(df_sentences.keys())
+
+
+
+    import pandas as pd
+    from tqdm import tqdm
+    from sentence_transformers import SentenceTransformer, util
+
+    df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
+    #
+    corpus = df_sentences_list
+    # corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
+    corpus_embeddings = np.load('embeddings.npy')
+
+    bi_encoder.max_seq_length = 512 #Truncate long passages to 512 tokens
+    top_k = 32 #Number of passages we want to retrieve with the bi-encoder
+
+    #The bi-encoder will retrieve the top_k documents. We use a cross-encoder to re-rank the results list and improve the quality
+
+    # corpus_embeddings_h = np.load('embeddings_h_r.npy')
+
+    with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
+        doc_embedding = pickle.load(pkl)
+
+    with open('tokenized_corpus.pickle', 'rb') as pkl:
+        tokenized_corpus = pickle.load(pkl)
+
+    bm25 = BM25Okapi(tokenized_corpus)
+    passages = corpus
+
+
+
+
+    # We lower case our text and remove stop-words from indexing
+    def bm25_tokenizer(text):
+        tokenized_doc = []
+        for token in text.lower().split():
+            token = token.strip(string.punctuation)
+
+            if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
+                tokenized_doc.append(token)
+        return tokenized_doc
+
+
+    def search(query):
+        # q = [str(userinput)]
+        doc = nlp(str(userinput))
+
+        ent_html = displacy.render(doc, style="ent", jupyter=False)
+        # Display the entity visualization in the browser:
+        st.markdown(ent_html, unsafe_allow_html=True)
+        ##### BM25 search (lexical search) #####
+        bm25_scores = bm25.get_scores(bm25_tokenizer(query))
+        top_n = np.argpartition(bm25_scores, -5)[-5:]
+        bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
+        bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
+
+        bm25list = {}
+        st.title("Top-5 lexical search (BM25) hits")
+        for hit in bm25_hits[0:5]:
+            row_dict = df.loc[df['all_review']== corpus[hit['corpus_id']]]
+
+            st.subheader(row_dict['Hotel'].values[0])
+            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
+            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
+            st.write('Description:')
+            st.expander(de.description.values[0],expanded=False)
+            # try:
+            #     st.write('Summary')
+            #     st.expander(de.summary.values[0],expanded=False)
+            # except:
+            #     None
+            # doc = corpus[hit['corpus_id']]
+            # kp.get_key_phrases(doc)
+
+            bm25list[row_dict['Hotel'].values[0]] = de.description.values[0][0:200]
+
+        ##### Semantic Search #####
+        # Encode the query using the bi-encoder and find potentially relevant passages
+        question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
+        # question_embedding = question_embedding.cuda()
+        hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
+        hits = hits[0]  # Get the hits for the first query
+
+        ##### Re-Ranking #####
+        # Now, score all retrieved passages with the cross_encoder
+        cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
+        cross_scores = cross_encoder.predict(cross_inp)
+
+        # Sort results by the cross-encoder scores
+        for idx in range(len(cross_scores)):
+            hits[idx]['cross-score'] = cross_scores[idx]
+
+        # Output of top-5 hits from bi-encoder
+        st.write("\n-------------------------\n")
+        st.title("Top-5 Bi-Encoder Retrieval hits")
+        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+        for hit in hits[0:5]:
+            # st.write("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
+            row_dict = df.loc[df['all_review']== corpus[hit['corpus_id']]]
+            st.subheader(row_dict['Hotel'].values[0])
+            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
+            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
+            st.write('Description:')
+            st.expander(de.description.values[0])
+            # try:
+            #     st.write('Summary')
+            #     st.expander(de.summary.values[0],expanded=False)
+            # except:
+            #     None
+
+        # Output of top-5 hits from re-ranker
+        st.write("\n-------------------------\n")
+        st.title("Top-5 Cross-Encoder Re-ranker hits")
+        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+        for hit in hits[0:5]:
+            # st.write("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))
+            row_dict = df.loc[df['all_review']== corpus[hit['corpus_id']]]
+            st.subheader(row_dict['Hotel'].values[0])
+            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
+            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
+            st.write('Description:')
+            st.expander(de.description.values[0])
+            # try:
+            #     st.write('Summary')
+            #     st.expander(de.summary.values[0],expanded=False)
+            # except:
+            #     None
+
+
+
+    sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
+    userinput = st.text_input('Tell us what you are looking for in your hotel','e.g. Hotel near Eiffel Tower with big rooms',autocomplete="on")
+    da = st.date_input(
+        "Date Check-in",
+        datetime.date(2022, 10, 5))
+
+    dst = st.date_input(
+        "Date Check-out",
+        datetime.date(2022, 10, 8))
+
+
+    if not userinput or userinput == sampletext:
+        st.write("Please enter a query to get results")
+    else:
+        query = [str(userinput)]
+        doc = nlp(str(userinput))
+        search(str(userinput))
+
+    # We use cosine-similarity and torch.topk to find the highest 5 scores
+
+if __name__ == '__main__':
+    main()
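paris-newer.py layers three rankers: BM25 for lexical matches, a bi-encoder for semantic recall, and a cross-encoder that re-ranks the bi-encoder's candidates. A condensed, self-contained sketch of that retrieve-and-re-rank pipeline (the two-passage corpus is illustrative):

    import numpy as np
    from rank_bm25 import BM25Okapi
    from sentence_transformers import SentenceTransformer, CrossEncoder, util

    corpus = ["Spacious rooms with a view of the Eiffel Tower.",
              "Cosy boutique hotel in Le Marais with a great breakfast."]
    query = "hotel near Eiffel Tower with big rooms"

    # 1) Lexical: BM25 over lower-cased whitespace tokens.
    bm25 = BM25Okapi([p.lower().split() for p in corpus])
    bm25_scores = bm25.get_scores(query.lower().split())

    # 2) Recall: the bi-encoder retrieves top-k semantic candidates.
    bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    hits = util.semantic_search(bi_encoder.encode(query, convert_to_tensor=True),
                                bi_encoder.encode(corpus, convert_to_tensor=True),
                                top_k=2)[0]

    # 3) Precision: the cross-encoder re-scores each (query, passage) pair.
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    cross_scores = cross_encoder.predict([[query, corpus[h['corpus_id']]] for h in hits])
    for h, s in sorted(zip(hits, cross_scores), key=lambda t: t[1], reverse=True):
        print(f"{s:.3f}  {corpus[h['corpus_id']]}")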
paris.py
ADDED
@@ -0,0 +1,298 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+
+
+@author: Hamza Farooq
+"""
+
+import spacy
+from spacy.lang.en.stop_words import STOP_WORDS
+from string import punctuation
+from collections import Counter
+from heapq import nlargest
+import os
+nlp = spacy.load("en_core_web_sm")
+
+from spacy import displacy
+import streamlit as st
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from matplotlib import pyplot as plt
+
+import nltk
+
+
+
+
+
+# import utils as utl
+
+import time
+import torch
+import transformers
+from transformers import BartTokenizer, BartForConditionalGeneration
+from string import punctuation
+# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import scipy.spatial
+import pickle as pkl
+from sentence_transformers import SentenceTransformer, util
+import torch
+
+
+
+def main():
+
+
+
+
+    # Settings
+    st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈" )
+    from string import punctuation
+    punctuation=punctuation+ '\n'
+
+    # def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):
+    #
+    #     text = text.replace('\n','')
+    #     text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
+    #     summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))
+    #     summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)
+    #     return summary_txt
+
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    import numpy as np
+    import pandas as pd
+    from sentence_transformers import SentenceTransformer
+    import scipy.spatial
+    import pickle as pkl
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    #import os
+    @st.cache(allow_output_mutation=True)
+    def load_model():
+        return SentenceTransformer('all-MiniLM-L6-v2')
+    embedder = load_model()
+    # embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+    # gc = geonamescache.GeonamesCache()
+    #
+    # # gets nested dictionary for countries
+    # countries = gc.get_countries()
+    #
+    # # gets nested dictionary for cities
+    # cities = gc.get_cities()
+    # def gen_dict_extract(var, key):
+    #     if isinstance(var, dict):
+    #         for k, v in var.items():
+    #             if k == key:
+    #                 yield v
+    #             if isinstance(v, (dict, list)):
+    #                 yield from gen_dict_extract(v, key)
+    #     elif isinstance(var, list):
+    #         for d in var:
+    #             yield from gen_dict_extract(d, key)
+    #
+    # cities = [*gen_dict_extract(cities, 'name')]
+    # countries = [*gen_dict_extract(countries, 'name')]
+    #
+    # cities.append('New York')
+
+
+
+
+    # mask = np.array(Image.open('upvote.png'))
+
+
+    #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
+    st.title("Parisian Hotel Finder")
+    with st.expander("ℹ️ - About this app", expanded=True):
+
+        st.write(
+            """
+- This app allows you to search for hotels based on what you're looking for, rather than just cities - it helps with reducing time to go through exhaustive reviews for each hotel!
+- It uses an innovative semantic search approach that leverages multiple NLP embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗.
+            """
+        )
+
+
+    punctuation=punctuation+ '\n'
+
+
+    #import os
+
+    # embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+    df_all = pd.read_csv('combined_paris.csv')
+
+    df_all = df_all[['Hotel','review']]
+
+
+    df_all = df_all.drop_duplicates()
+    df_all = df_all.reset_index(drop=True)
+    summary_hotel = pd.read_csv('df_combined_paris.csv')
+    #
+    # df['hotel_name'].drop_duplicates()
+
+    df_combined = df_all.sort_values(['Hotel']).groupby('Hotel', sort=False).review.apply(''.join).reset_index(name='all_review')
+
+    import re
+
+    # df_combined = pd.read_csv('df_combined.csv')
+
+    df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-Z0-9\s]','',x))
+
+    def lower_case(input_str):
+        input_str = input_str.lower()
+        return input_str
+
+    df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
+
+    df = df_combined
+
+    df_sentences = df_combined.set_index("all_review")
+
+    df_sentences = df_sentences["Hotel"].to_dict()
+    df_sentences_list = list(df_sentences.keys())
+
+    import pandas as pd
+    from tqdm import tqdm
+    from sentence_transformers import SentenceTransformer, util
+
+    df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
+    #
+    corpus = df_sentences_list
+    # corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
+    corpus_embeddings = np.load('embeddings_review.npy')
+    corpus_embeddings_h = np.load('embeddings_h_r.npy')
+    #
+    # model = SentenceTransformer('all-MiniLM-L6-v2')
+    # paraphrases = util.paraphrase_mining(model, corpus)
+
+    #queries = ['Hotel close to Central Park',
+    #           'Hotel with breakfast'
+    #           ]
+
+
+    # from transformers import AutoTokenizer, AutoModel
+    # import torch
+    # import torch.nn.functional as F
+    #
+    # #Mean Pooling - Take attention mask into account for correct averaging
+    # def mean_pooling(model_output, attention_mask):
+    #     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    #     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    #     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    #
+    #
+    # # Sentences we want sentence embeddings for
+    # sentences = corpus
+    #
+    # # Load model from HuggingFace Hub
+    # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
+    # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
+    #
+    # # Tokenize sentences
+    # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+    #
+    # # Compute token embeddings
+    # with torch.no_grad():
+    #     model_output = model(**encoded_input)
+    #
+    # # Perform pooling
+    # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    #
+    # # Normalize embeddings
+    # sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+    #
+    # st.text("Sentence embeddings:")
+    # st.text(sentence_embeddings)
+    #
+    #
+
+    #corpus_embeddings = sentence_embeddings
+    # Query sentences
+
+    def plot_cloud(wordcloud):
+        # Set figure size
+        plt.figure(figsize=(20, 10))
+        # Display image
+        st.pyplot(wordcloud)
+        # No axis details
+        #st.pyplot.axis("off");
+    sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
+    userinput = st.text_input('Tell us what you are looking for in your hotel','e.g. Hotel near Eiffel Tower with big rooms',autocomplete="on")
+    if not userinput or userinput == sampletext:
+        st.write("Please enter a query to get results")
+    else:
+        query = [str(userinput)]
+        doc = nlp(str(userinput))
+        # for ent in doc.ents:
+        #     if ent.label_ == 'GPE':
+        #         if ent.text in countries:
+        #             st.write(f"Country : {ent.text}")
+        #         elif ent.text in cities:
+        #             st.write("city")
+        #             st.write(ent.text)
+        #             st.write(f"City : {ent.text}")
+        #         else:
+        #             print(f"Other GPE : {ent.text}")
+        # query_embeddings = embedder.encode(queries,show_progress_bar=True)
+        top_k = min(5, len(corpus))
+
+        query_embedding = embedder.encode(query, convert_to_tensor=True)
+
+        # We use cosine-similarity and torch.topk to find the highest 5 scores
+        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
+        top_results = torch.topk(cos_scores, k=top_k)
+
+        # st.write("\n\n======================\n\n")
+        # st.write("Query:", query)
+        # # doc = nlp(query)
+        sentence_spans = list(doc.sents)
+        ent_html = displacy.render(doc, style="ent", jupyter=False)
+        # Display the entity visualization in the browser:
+        st.markdown(ent_html, unsafe_allow_html=True)
+
+        #displacy.render(doc, jupyter = True, style="ent")
+        st.write("##")
+        st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
+        st.write("\n\n======================\n\n")
+
+        for score, idx in zip(top_results[0], top_results[1]):
+
+            row_dict = df.loc[df['all_review']== corpus[idx]]
+            st.subheader(row_dict['Hotel'].values[0])
+
+            hotel_subset = df_all.loc[df_all['Hotel']==row_dict['Hotel'].values[0]]
+            hotel_sub = summary_hotel.loc[summary_hotel['Hotel']==row_dict['Hotel'].values[0]]
+            st.caption("Review Summary:")
+            st.write(hotel_sub['summary'].values[0])
+            st.caption("Relevancy: {:.4f}".format(score))
+            st.caption("Relevant reviews:")
+
+            df_sentences_h = hotel_subset.set_index("review")
+
+            df_sentences_h = df_sentences_h["Hotel"].to_dict()
+            df_sentences_list_h = list(df_sentences_h.keys())
+
+
+
+            df_sentences_list_h = [str(d) for d in tqdm(df_sentences_list_h)]
+            #
+            corpus_h = df_sentences_list_h
+            # corpus_embeddings_h = embedder.encode(corpus_h,show_progress_bar=True)
|
289 |
+
sublist = [element for i, element in enumerate(corpus_embeddings_h) if i in (df_all[df_all['Hotel'] == row_dict['Hotel'].values[0]].index.values)]
|
290 |
+
cos_scores_h = util.pytorch_cos_sim(query_embedding, sublist)[0]
|
291 |
+
top_results_h = torch.topk(cos_scores_h, k=top_k)
|
292 |
+
|
293 |
+
for score, idx in zip(top_results_h[0], top_results_h[1]):
|
294 |
+
st.write(corpus_h[idx])
|
295 |
+
|
296 |
+
|
297 |
+
if __name__ == '__main__':
|
298 |
+
main()
|
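
The core of app.py above is plain bi-encoder retrieval: the query and every hotel document are embedded into the same vector space, then ranked by cosine similarity. A minimal, self-contained sketch of that pattern (the model name and toy corpus here are illustrative, not the app's exact data):

    from sentence_transformers import SentenceTransformer, util
    import torch

    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    corpus = [
        "great location near the eiffel tower and spacious rooms",
        "tiny rooms but an excellent breakfast buffet",
        "quiet hotel close to the metro with friendly staff",
    ]
    # In the app these embeddings are loaded from .npy files instead of encoded live
    corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

    query_embedding = embedder.encode("Hotel near Eiffel Tower with big rooms", convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]  # one score per document
    top = torch.topk(cos_scores, k=2)
    for score, idx in zip(top.values, top.indices):
        print("{:.4f}  {}".format(score, corpus[idx]))
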
paris_clean_newer.csv
ADDED
The diff for this file is too large to render.
See raw diff
query_generator.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
requirements.txt
ADDED
@@ -0,0 +1,14 @@
+pandas
+streamlit==1.1.0
+regex==2021.8.3
+scikit-learn
+sentence_transformers
+scipy
+tqdm
+gensim
+plotly
+wordcloud
+matplotlib
+spacy
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
+rank-bm25
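
Pinning the spaCy model as a direct wheel URL lets pip install it like any other requirement, so the Space never has to run `python -m spacy download` at build time. A quick sanity check of the installed model (hypothetical snippet, not part of this upload):

    import spacy

    nlp = spacy.load("en_core_web_sm")  # the wheel above registers this package name
    doc = nlp("Hotel near Eiffel Tower with big rooms")
    print([(ent.text, ent.label_) for ent in doc.ents])
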
summary.ipynb
ADDED
@@ -0,0 +1,654 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import transformers\n",
+    "from transformers import BartTokenizer, BartForConditionalGeneration\n",
+    "tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')\n",
+    "mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')\n",
+    "torch_device = 'cpu'\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):\n",
+    "\n",
+    "    text = text.replace('\\n','')\n",
+    "    text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)\n",
+    "    summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))\n",
+    "    summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)\n",
+    "    return summary_txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "import scipy.spatial\n",
+    "import pickle as pkl\n",
+    "from sentence_transformers import SentenceTransformer, util\n",
+    "import torch\n",
+    "#import os\n",
+    "\n",
+    "\n",
+    "df = pd.read_csv('combined_paris.csv')\n",
+    "\n",
+    "\n",
+    "df_combined = df.sort_values(['Hotel']).groupby('Hotel', sort=False).review.apply(''.join).reset_index(name='all_review')\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-Z0-9\\\\s]','',x))\n",
+    "def lower_case(input_str):\n",
+    "    input_str = input_str.lower()\n",
+    "    return input_str"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))\n",
+    "\n",
+    "df = df_combined\n",
+    "\n",
+    "df_sentences = df_combined.set_index(\"all_review\")\n",
+    "\n",
+    "df_sentences = df_sentences[\"Hotel\"].to_dict()\n",
+    "df_sentences_list = list(df_sentences.keys())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Hotel</th>\n",
+       "      <td>25hours Hotel Terminus Nord</td>\n",
+       "      <td>Acacias Etoile Hotel</td>\n",
+       "      <td>COQ Hotel Paris</td>\n",
+       "      <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
+       "      <td>Cler Hotel</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>all_review</th>\n",
+       "      <td>weve spent lots of time in paris and this was ...</td>\n",
+       "      <td>the hotel is great for value the breakfast sel...</td>\n",
+       "      <td>stayed for a short city break the hotel is a ...</td>\n",
+       "      <td>room was very clean transportation is very ne...</td>\n",
+       "      <td>we had the best stay at cler hotel the locati...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       " 0 \\\n",
+       "Hotel 25hours Hotel Terminus Nord \n",
+       "all_review weve spent lots of time in paris and this was ... \n",
+       "\n",
+       " 1 \\\n",
+       "Hotel Acacias Etoile Hotel \n",
+       "all_review the hotel is great for value the breakfast sel... \n",
+       "\n",
+       " 2 \\\n",
+       "Hotel COQ Hotel Paris \n",
+       "all_review stayed for a short city break the hotel is a ... \n",
+       "\n",
+       " 3 \\\n",
+       "Hotel Campanile Paris 14 - Maine Montparnasse \n",
+       "all_review room was very clean transportation is very ne... \n",
+       "\n",
+       " 4 \n",
+       "Hotel Cler Hotel \n",
+       "all_review we had the best stay at cler hotel the locati... "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_combined.head().T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
+     ]
+    }
+   ],
+   "source": [
+    "long_summary = []\n",
+    "\n",
+    "for i in range(len(df_combined)):\n",
+    "    t = bart_summarize(df_combined['all_review'][i])\n",
+    "    long_summary.append(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_combined['summary'] = long_summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_combined.to_csv('df_combined_paris.csv',index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Hotel</th>\n",
+       "      <th>all_review</th>\n",
+       "      <th>summary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>25hours Hotel Terminus Nord</td>\n",
+       "      <td>weve spent lots of time in paris and this was ...</td>\n",
+       "      <td>we were blown away by this excellent hotel we ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Acacias Etoile Hotel</td>\n",
+       "      <td>the hotel is great for value the breakfast sel...</td>\n",
+       "      <td>The hotel is great for value the breakfast sel...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COQ Hotel Paris</td>\n",
+       "      <td>stayed for a short city break the hotel is a ...</td>\n",
+       "      <td>stayed for a short city break the hotel is a ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
+       "      <td>room was very clean transportation is very ne...</td>\n",
+       "      <td>hotel turned out to be perfect for our short ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Cler Hotel</td>\n",
+       "      <td>we had the best stay at cler hotel the locati...</td>\n",
+       "      <td>we had the best stay at cler hotel the locati...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       " Hotel \\\n",
+       "0 25hours Hotel Terminus Nord \n",
+       "1 Acacias Etoile Hotel \n",
+       "2 COQ Hotel Paris \n",
+       "3 Campanile Paris 14 - Maine Montparnasse \n",
+       "4 Cler Hotel \n",
+       "\n",
+       " all_review \\\n",
+       "0 weve spent lots of time in paris and this was ... \n",
+       "1 the hotel is great for value the breakfast sel... \n",
+       "2 stayed for a short city break the hotel is a ... \n",
+       "3 room was very clean transportation is very ne... \n",
+       "4 we had the best stay at cler hotel the locati... \n",
+       "\n",
+       " summary \n",
+       "0 we were blown away by this excellent hotel we ... \n",
+       "1 The hotel is great for value the breakfast sel... \n",
+       "2 stayed for a short city break the hotel is a ... \n",
+       "3 hotel turned out to be perfect for our short ... \n",
+       "4 we had the best stay at cler hotel the locati... "
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_combined.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dockerfile df_combined.csv\n",
+      "Hotel New York Combined.csv en_core_web_sm-3.2.0-py3-none-any.whl\n",
+      "README.md query_generator.ipynb\n",
+      "Untitled.ipynb requirements.txt\n",
+      "app.py summary.ipynb\n",
+      "app.yaml\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/aimzlicious/miniforge3/envs/tf_m1/lib/python3.8/site-packages/huggingface_hub/snapshot_download.py:6: FutureWarning: snapshot_download.py has been made private and will no longer be available from version 0.11. Please use `from huggingface_hub import snapshot_download` to import the only public function in this module. Other members of the file may be changed without a deprecation notice.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "import scipy.spatial\n",
+    "import pickle as pkl\n",
+    "from sentence_transformers import SentenceTransformer, util\n",
+    "import torch\n",
+    "df_combined_paris = pd.read_csv('df_combined_paris.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Hotel</th>\n",
+       "      <th>all_review</th>\n",
+       "      <th>summary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>25hours Hotel Terminus Nord</td>\n",
+       "      <td>weve spent lots of time in paris and this was ...</td>\n",
+       "      <td>we were blown away by this excellent hotel we ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Acacias Etoile Hotel</td>\n",
+       "      <td>the hotel is great for value the breakfast sel...</td>\n",
+       "      <td>The hotel is great for value the breakfast sel...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COQ Hotel Paris</td>\n",
+       "      <td>stayed for a short city break the hotel is a ...</td>\n",
+       "      <td>stayed for a short city break the hotel is a ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
+       "      <td>room was very clean transportation is very ne...</td>\n",
+       "      <td>hotel turned out to be perfect for our short ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Cler Hotel</td>\n",
+       "      <td>we had the best stay at cler hotel the locati...</td>\n",
+       "      <td>we had the best stay at cler hotel the locati...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       " Hotel \\\n",
+       "0 25hours Hotel Terminus Nord \n",
+       "1 Acacias Etoile Hotel \n",
+       "2 COQ Hotel Paris \n",
+       "3 Campanile Paris 14 - Maine Montparnasse \n",
+       "4 Cler Hotel \n",
+       "\n",
+       " all_review \\\n",
+       "0 weve spent lots of time in paris and this was ... \n",
+       "1 the hotel is great for value the breakfast sel... \n",
+       "2 stayed for a short city break the hotel is a ... \n",
+       "3 room was very clean transportation is very ne... \n",
+       "4 we had the best stay at cler hotel the locati... \n",
+       "\n",
+       " summary \n",
+       "0 we were blown away by this excellent hotel we ... \n",
+       "1 The hotel is great for value the breakfast sel... \n",
+       "2 stayed for a short city break the hotel is a ... \n",
+       "3 hotel turned out to be perfect for our short ... \n",
+       "4 we had the best stay at cler hotel the locati... "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_combined_paris.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_paris = pd.read_csv('paris_clean_newer.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hotel=pd.DataFrame(df_paris['Hotel'].drop_duplicates())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Hotel</th>\n",
+       "      <th>all_review</th>\n",
+       "      <th>summary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>25hours Hotel Terminus Nord</td>\n",
+       "      <td>weve spent lots of time in paris and this was ...</td>\n",
+       "      <td>we were blown away by this excellent hotel we ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Acacias Etoile Hotel</td>\n",
+       "      <td>the hotel is great for value the breakfast sel...</td>\n",
+       "      <td>The hotel is great for value the breakfast sel...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COQ Hotel Paris</td>\n",
+       "      <td>stayed for a short city break the hotel is a ...</td>\n",
+       "      <td>stayed for a short city break the hotel is a ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
+       "      <td>room was very clean transportation is very ne...</td>\n",
+       "      <td>hotel turned out to be perfect for our short ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Cler Hotel</td>\n",
+       "      <td>we had the best stay at cler hotel the locati...</td>\n",
+       "      <td>we had the best stay at cler hotel the locati...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>89</th>\n",
+       "      <td>Sofitel Paris Le Faubourg</td>\n",
+       "      <td>4 years ago i was the last time at sofitel le ...</td>\n",
+       "      <td>4 years ago i was the last time at sofitel le ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>90</th>\n",
+       "      <td>St Christopher's Gare du Nord Paris</td>\n",
+       "      <td>when arriving to the area it felt a little dan...</td>\n",
+       "      <td>Barry is the best bartender in paris cheers gr...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>91</th>\n",
+       "      <td>St Christopher's Inn Canal Paris</td>\n",
+       "      <td>ive stayed at st christopher inn canal in pari...</td>\n",
+       "      <td>ive stayed at st christopher inn canal in pari...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>92</th>\n",
+       "      <td>Touring Hotel</td>\n",
+       "      <td>hotel is in a great location minutes walk fro...</td>\n",
+       "      <td>Hotel is in a great location minutes walk fro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>93</th>\n",
+       "      <td>Warwick Paris</td>\n",
+       "      <td>if i know of anybody heading to paris i will r...</td>\n",
+       "      <td>warwick hotel in paris is a good hotel to stay...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>94 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       " Hotel \\\n",
+       "0 25hours Hotel Terminus Nord \n",
+       "1 Acacias Etoile Hotel \n",
+       "2 COQ Hotel Paris \n",
+       "3 Campanile Paris 14 - Maine Montparnasse \n",
+       "4 Cler Hotel \n",
+       ".. ... \n",
+       "89 Sofitel Paris Le Faubourg \n",
+       "90 St Christopher's Gare du Nord Paris \n",
+       "91 St Christopher's Inn Canal Paris \n",
+       "92 Touring Hotel \n",
+       "93 Warwick Paris \n",
+       "\n",
+       " all_review \\\n",
+       "0 weve spent lots of time in paris and this was ... \n",
+       "1 the hotel is great for value the breakfast sel... \n",
+       "2 stayed for a short city break the hotel is a ... \n",
+       "3 room was very clean transportation is very ne... \n",
+       "4 we had the best stay at cler hotel the locati... \n",
+       ".. ... \n",
+       "89 4 years ago i was the last time at sofitel le ... \n",
+       "90 when arriving to the area it felt a little dan... \n",
+       "91 ive stayed at st christopher inn canal in pari... \n",
+       "92 hotel is in a great location minutes walk fro... \n",
+       "93 if i know of anybody heading to paris i will r... \n",
+       "\n",
+       " summary \n",
+       "0 we were blown away by this excellent hotel we ... \n",
+       "1 The hotel is great for value the breakfast sel... \n",
+       "2 stayed for a short city break the hotel is a ... \n",
+       "3 hotel turned out to be perfect for our short ... \n",
+       "4 we had the best stay at cler hotel the locati... \n",
+       ".. ... \n",
+       "89 4 years ago i was the last time at sofitel le ... \n",
+       "90 Barry is the best bartender in paris cheers gr... \n",
+       "91 ive stayed at st christopher inn canal in pari... \n",
+       "92 Hotel is in a great location minutes walk fro... \n",
+       "93 warwick hotel in paris is a good hotel to stay... \n",
+       "\n",
+       "[94 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_combined_paris.merge(hotel,how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "4bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
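
For reference, the bart_summarize helper in this notebook is standard conditional generation with beam search; a condensed sketch of the same call pattern (the generation parameters mirror the notebook, the input text is made up):

    from transformers import BartTokenizer, BartForConditionalGeneration

    tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

    text = "we loved the rooftop bar and the rooms were spotless " * 40  # stand-in for joined reviews
    inputs = tr([text], return_tensors='pt', max_length=1024, truncation=True)  # explicit truncation silences the warning seen above
    summary_ids = mdl.generate(inputs['input_ids'], num_beams=20, length_penalty=2.0,
                               max_length=2048, min_length=56, no_repeat_ngram_size=2)
    print(tr.decode(summary_ids[0], skip_special_tokens=True))
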
tokenized_corpus.pickle
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e99b20be01f7889248d5b3f667df8947ae6ca676f3a525717305e5124c8b739e
+size 1261235
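
tokenized_corpus.pickle, together with the rank-bm25 entry in requirements.txt, points at a lexical BM25 retriever kept alongside the embedding search. A minimal sketch of how such a pickled token corpus is typically used (the file's exact format is assumed, not verified here):

    import pickle
    from rank_bm25 import BM25Okapi

    # Assumed format: a list of token lists, one per review document
    with open('tokenized_corpus.pickle', 'rb') as f:
        tokenized_corpus = pickle.load(f)

    bm25 = BM25Okapi(tokenized_corpus)
    query_tokens = "hotel near eiffel tower with big rooms".split()
    scores = bm25.get_scores(query_tokens)   # one BM25 score per document
    top5 = scores.argsort()[::-1][:5]        # indices of the five best matches
    print(top5)
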