Spaces:

hamza50
/

parishotel

Runtime error

App Files Files Community

parishotel / basic.py

hamza50

Upload 23 files

93b46fc about 2 years ago

raw

history blame contribute delete

8.16 kB

	#!/usr/bin/env python
	from datetime import datetime
	from time import time
	from lxml import html,etree
	from reviews_final import scrape, write_in_csv
	import pandas as pd
	import requests,re
	import os,sys
	import unicodecsv as csv
	import argparse
	import numpy as np
	import json
	def clean(text):
	    """Return ``text`` with all whitespace runs collapsed to single spaces.

	    ``text`` may be a string or an iterable of string fragments; fragments
	    are concatenated without a separator before whitespace is normalised
	    (this removes newlines, carriage returns and tabs as well as repeated
	    spaces). A falsy ``text`` (``None``, ``""``, ``[]``) yields ``None``.
	    """
	    if not text:
	        return None
	    merged = ''.join(text)
	    # split() with no argument drops every whitespace run, including any at
	    # the edges, so the re-joined result needs no further stripping.
	    tokens = merged.split()
	    return ' '.join(tokens)




	# Scheme + host prepended to the relative URLs returned by TripAdvisor.
	_TA_BASE_URL = "http://www.tripadvisor.com"
	# URL segments inserted after the second "-"-separated part of the first
	# result page URL to address the following result pages.
	_RESULT_PAGE_OFFSETS = ("oa30", "oa60", "oa90", "oa120")
	# Seconds to wait on each HTTP request before giving up.
	_REQUEST_TIMEOUT = 120

	# Listing containers; the second expression is the fallback used when the
	# first one matches nothing.
	_XPATH_LISTINGS = '//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]'
	_XPATH_LISTINGS_FALLBACK = '//div[contains(@class,"listItem")]//div[@class="listing "]'

	# Per-listing expressions (relative to one listing element).
	_XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
	_XPATH_REVIEWS = './/a[@class="review_count"]//text()'
	_XPATH_RANK = './/div[@class="popindex"]//text()'
	# Matches any bubble-rating element. The previous expression was pinned to
	# the literal class "bubble_45", so listings carrying any other bubble
	# class never produced a rating.
	_XPATH_RATING = './/*[self::a or self::span][contains(@class,"ui_bubble_rating")]/@alt'
	_XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
	# Attribute name was misspelled "@casls", which can never match.
	_XPATH_HOTEL_FEATURES = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
	_XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
	_XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()'
	_XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()'
	# Example of the targeted markup:
	# <span class="dekGp Ci _R S4 H3 MD">#74 of 319 hotels in Lisbon</span>
	_XPATH_RATING_ORDER = './/span[contains(@class,"dekGp Ci _R S4 H3 MD")]//text()'
	# Absolute (page-level) expression: evaluated against the whole document,
	# so its result is the same for every listing on a page.
	_XPATH_OFFICIAL_DESCRIPTION = '//div[contains(text(),"Description")]/following-sibling::div//span[contains(@class,"introText")]/text()'

	# Extracts the number from the "view deals" text of a listing.
	_DEALS_RE = re.compile(r"all\s+?(\d+)\s+?")


	def _result_page_urls(first_page_url):
	    """Return ``first_page_url`` followed by its offset (paginated) variants.

	    Each variant is built by inserting one of ``_RESULT_PAGE_OFFSETS`` after
	    the second "-"-separated segment of the URL. All remaining segments are
	    kept, so URLs with more than four segments are no longer truncated. If
	    the URL has fewer than three segments there is no insertion point and
	    only the first page is returned.
	    """
	    parts = first_page_url.split("-")
	    if len(parts) < 3:
	        return [first_page_url]
	    return [first_page_url] + [
	        "-".join(parts[:2] + [offset] + parts[2:])
	        for offset in _RESULT_PAGE_OFFSETS
	    ]


	def _extract_hotel(hotel, locality, checkIn, checkOut, official_description):
	    """Build the output record (a dict) for one listing element.

	    ``hotel`` is an lxml element for a single listing; the remaining
	    arguments are copied into the record unchanged. Fields whose XPath
	    matches nothing are ``None``, except ``reviews`` and ``no_of_deals``
	    which default to ``0`` and ``hotel_features`` which defaults to ``''``.
	    """
	    raw_booking_provider = hotel.xpath(_XPATH_BOOKING_PROVIDER)
	    raw_no_of_deals = hotel.xpath(_XPATH_VIEW_DEALS)
	    raw_hotel_link = hotel.xpath(_XPATH_HOTEL_LINK)
	    raw_no_of_reviews = hotel.xpath(_XPATH_REVIEWS)
	    raw_rank = hotel.xpath(_XPATH_RANK)
	    raw_rating = hotel.xpath(_XPATH_RATING)
	    raw_hotel_name = hotel.xpath(_XPATH_HOTEL_NAME)
	    raw_hotel_features = hotel.xpath(_XPATH_HOTEL_FEATURES)
	    raw_hotel_price_per_night = hotel.xpath(_XPATH_HOTEL_PRICE)
	    raw_rank_order = hotel.xpath(_XPATH_RATING_ORDER)

	    deals_match = _DEALS_RE.search(''.join(raw_no_of_deals))

	    return {
	        'hotel_name': ''.join(raw_hotel_name).strip() if raw_hotel_name else None,
	        'url': _TA_BASE_URL + raw_hotel_link[0] if raw_hotel_link else None,
	        'locality': locality,
	        'reviews': ''.join(raw_no_of_reviews).replace("reviews", "").replace(",", "") if raw_no_of_reviews else 0,
	        'rank': ''.join(raw_rank) if raw_rank else None,
	        # Only the first match is used so that several rating elements in
	        # one listing are not concatenated into a single string.
	        'tripadvisor_rating': raw_rating[0].replace('of 5 bubbles', '').strip() if raw_rating else None,
	        'checkOut': checkOut,
	        'checkIn': checkIn,
	        'hotel_features': ','.join(raw_hotel_features),
	        'price_per_night': ''.join(raw_hotel_price_per_night).replace('\n', '') if raw_hotel_price_per_night else None,
	        'no_of_deals': deals_match.group(1) if deals_match else 0,
	        'booking_provider': ''.join(raw_booking_provider).strip() if raw_booking_provider else None,
	        'raw_rank': ''.join(raw_rank_order) if raw_rank_order else None,
	        'desc': official_description,
	    }


	def parse(locality,checkin_date,checkout_date,sort):
	    """Scrape TripAdvisor hotel listings for ``locality`` into ``data_file.json``.

	    The locality is resolved to a hotel search URL through the TypeAheadJson
	    autocomplete endpoint (first result wins). That URL and its offset
	    variants are then requested with the stay dates as form data, and one
	    record per listing is extracted. After every page that yielded listings
	    the records accumulated so far are written to ``data_file.json`` in the
	    current working directory, overwriting the previous content.

	    Args:
	        locality: Search text for the autocomplete query; it is URL-encoded
	            before being placed in the query string.
	        checkin_date: Object providing ``strftime`` (e.g. ``datetime``).
	        checkout_date: Object providing ``strftime`` (e.g. ``datetime``).
	        sort: Value sent as the ``sortOrder`` form field.

	    Returns:
	        The list of result page URLs that were requested, first page first.

	    Raises:
	        IndexError: If the autocomplete response contains no results.
	        Errors raised by ``requests`` or by HTML parsing are not caught here.
	    """
	    # Function-scope import so the file's import section stays untouched.
	    from urllib.parse import quote

	    checkIn = checkin_date.strftime("%Y/%m/%d")
	    checkOut = checkout_date.strftime("%Y/%m/%d")
	    print ("Scraper Inititated for Locality:%s"%locality)
	    header = {
	        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
	    }

	    # TripAdvisor renders its autocomplete list from this API.
	    print ("Finding search result page URL")
	    geo_url = (
	        'https://www.tripadvisor.com/TypeAheadJson?action=API&startTime='
	        + str(int(time()))
	        + '&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true&query='
	        # Encode spaces and reserved characters so the query stays one parameter.
	        + quote(locality)
	    )
	    api_response = requests.get(geo_url,headers=header, timeout=_REQUEST_TIMEOUT).json()

	    # Take the search URL for the query from the first autocomplete result.
	    results = api_response['results']
	    if not results:
	        raise IndexError("No TripAdvisor autocomplete results for locality: %s" % locality)
	    url_from_autocomplete = _TA_BASE_URL + results[0]['url']
	    print ('URL found %s'%url_from_autocomplete)
	    urllist = _result_page_urls(url_from_autocomplete)

	    # Form data that selects the stay dates and sort order for the listing.
	    date = checkin_date.strftime("%Y_%m_%d")+"_"+checkout_date.strftime("%Y_%m_%d")
	    form_data = {
	        'changeSet': 'TRAVEL_INFO',
	        'showSnippets': 'false',
	        'staydates': date,
	        'uguests': '2',
	        'sortOrder': sort,
	    }
	    cookies = {"SetCurrency":"USD"}

	    json_arr = []
	    for page_url in urllist:
	        print(page_url)

	        # The Referer is set to the page being requested; an earlier note in
	        # this file says TripAdvisor redirects to the home page without it.
	        headers = {
	            'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
	            'Accept-Encoding': 'gzip,deflate',
	            'Accept-Language': 'en-US,en;q=0.5',
	            'Cache-Control': 'no-cache',
	            'Connection': 'keep-alive',
	            'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
	            'Host': 'www.tripadvisor.com',
	            'Pragma': 'no-cache',
	            'Referer': page_url,
	            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
	            'X-Requested-With': 'XMLHttpRequest',
	        }
	        print ("Downloading search results page")
	        # TLS verification is left at the requests default (enabled) and a
	        # timeout is set so a stalled connection cannot hang the scraper.
	        page_response = requests.post(
	            url=page_url,
	            data=form_data,
	            headers=headers,
	            cookies=cookies,
	            timeout=_REQUEST_TIMEOUT,
	        )
	        print ("Parsing results ")
	        parser = html.fromstring(page_response.text)
	        hotel_lists = parser.xpath(_XPATH_LISTINGS)
	        if not hotel_lists:
	            hotel_lists = parser.xpath(_XPATH_LISTINGS_FALLBACK)
	        if not hotel_lists:
	            # Nothing to extract from this page; move on to the next offset.
	            continue

	        # Page-level value, computed once instead of once per listing.
	        official_description = clean(parser.xpath(_XPATH_OFFICIAL_DESCRIPTION))

	        for hotel in hotel_lists:
	            json_arr.append(
	                _extract_hotel(hotel, locality, checkIn, checkOut, official_description)
	            )

	        # Persist after each page so earlier pages survive a later failure,
	        # without rewriting the whole file once per listing.
	        print("Writing scraped data")
	        with open('data_file.json', 'w') as outfile:
	            json.dump(json_arr, outfile)

	    return urllist