# PrmScrp / wileyscrapper.py
# Wiley Online Library scraper helpers.
# (repo-page metadata removed: author H4CK3R-5M4CK3R, commit c83cceb, 2.59 kB)
import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator
from seleliumdriver import WebScraper
def save(data) -> None:
    """Write ``str(data)`` to ``data.html`` in the CWD (debug dump, overwritten each call).

    Args:
        data: Any object; it is stringified before writing. Callers pass both
            raw HTML strings and BeautifulSoup objects.
    """
    # Explicit encoding: the platform default could fail on non-ASCII page content.
    with open("data.html", "w", encoding="utf-8") as op:
        op.write(str(data))
def get_headers(data: str) -> dict:
    """Parse a raw header string into a dict.

    Args:
        data (str): Headers as a multi-line string, one ``Key: Value`` pair per
            line — e.g. copied from Firefox dev tools via "Copy Request Headers".

    Returns:
        dict: Header names mapped to values. Values equal to ``"none"``,
            ``"true"`` or ``"false"`` (case-insensitive) are converted to
            ``None``, ``True`` and ``False`` respectively.
    """
    out = {}
    for line in data.strip().split("\n"):
        line = line.strip()
        # Skip blank or malformed (no colon) lines instead of raising IndexError.
        if not line or ":" not in line:
            continue
        # partition splits on the first colon only, so values may contain colons.
        key, _, value = line.partition(":")
        value = value.strip()
        lowered = value.lower()
        if lowered == "none":
            value = None
        elif lowered == "true":
            value = True
        elif lowered == "false":
            value = False
        out[key.strip()] = value
    return out
def get_links(url: str) -> list:
    """Collect article links from a Wiley journal issue page.

    Args:
        url (str): URL of the issue table-of-contents page.

    Returns:
        list: Absolute ``onlinelibrary.wiley.com`` article URLs found on the page.
    """
    browser = WebScraper(browser="firefox", hidden=False)
    browser.get(url)  # headers could be supplied here via get_headers(...) if needed
    soup = BeautifulSoup(browser.get_html(), "lxml")
    save(browser.get_html())  # keep a debug copy of the rendered page
    anchors = soup.findAll("a", {"class": "issue-item__title visitable"})
    return [f'https://onlinelibrary.wiley.com{anchor.get("href")}' for anchor in anchors]
def decode_email(encoded_str):
    """Decode a Cloudflare-obfuscated (``data-cfemail``) address.

    The first two hex digits are an XOR key; the remainder is the email,
    hex-encoded with every byte XORed against that key.

    Args:
        encoded_str: Hex string taken from a ``data-cfemail`` attribute.

    Returns:
        str: The plain-text email address.
    """
    xor_key = int(encoded_str[:2], 16)
    payload = bytes.fromhex(encoded_str[2:])
    return "".join(chr(b ^ xor_key) for b in payload)
def get_details(url: str):
    """Scrape author name / email / address triples from a Wiley article page.

    Args:
        url (str): Article page URL.

    Returns:
        list: One ``{"Name", "Email", "Address"}`` dict per author that exposes
            a Cloudflare-obfuscated email; authors without one are skipped.
            Empty list when the author section is missing from the page.
    """
    driver = WebScraper(browser="firefox", hidden=False)
    driver.get(url)
    data = driver.get_html()
    full_page = BeautifulSoup(data, "lxml")
    save(full_page)  # debug copy of the parsed page
    author_detail = full_page.find("div", {"class": "accordion-tabbed"})
    if author_detail is None:
        # Layout changed or no author block on this page — fail soft
        # instead of raising AttributeError on None.findAll(...).
        return []
    output = []
    for author in author_detail.findAll("span", {"class": "accordion-tabbed__tab-mobile accordion__closed"}):
        name_tag = author.find("p", {"class": "author-name"})
        email_tag = author.find("span", {"class": "__cf_email__"})
        # Skip entries missing a name tag (previously crashed) or an email.
        if name_tag is None or email_tag is None:
            continue
        output.append(
            {
                "Name": name_tag.text.strip(),
                "Email": decode_email(email_tag.get("data-cfemail")),
                # The <p> immediately after the author-name holds the affiliation.
                "Address": name_tag.findNext("p").text.strip(),
            }
        )
    return output