# PrmScrp / wileyscrapper.py
# Wiley Online Library scraper helpers.
# (repo-page metadata removed: author H4CK3R-5M4CK3R, commit c83cceb, 2.59 kB)
import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator
from seleliumdriver import WebScraper
def save(data) -> None:
    """Write ``str(data)`` to ``data.html`` in the CWD (debug dump, overwritten each call).

    Args:
        data: Any object; it is stringified before writing. Callers pass both
            raw HTML strings and BeautifulSoup objects.
    """
    # Explicit encoding: the platform default could fail on non-ASCII page content.
    with open("data.html", "w", encoding="utf-8") as op:
        op.write(str(data))
def get_headers(data: str) -> dict:
    """Parse a raw header string into a dict.

    Args:
        data (str): Headers as a multi-line string, one ``Key: Value`` pair per
            line — e.g. copied from Firefox dev tools via "Copy Request Headers".

    Returns:
        dict: Header names mapped to values. Values equal to ``"none"``,
            ``"true"`` or ``"false"`` (case-insensitive) are converted to
            ``None``, ``True`` and ``False`` respectively.
    """
    out = {}
    for line in data.strip().split("\n"):
        line = line.strip()
        # Skip blank or malformed (no colon) lines instead of raising IndexError.
        if not line or ":" not in line:
            continue
        # partition splits on the first colon only, so values may contain colons.
        key, _, value = line.partition(":")
        value = value.strip()
        lowered = value.lower()
        if lowered == "none":
            value = None
        elif lowered == "true":
            value = True
        elif lowered == "false":
            value = False
        out[key.strip()] = value
    return out
def get_links(url: str) -> list:
    """Collect article links from a Wiley journal issue page.

    Args:
        url (str): URL of the issue table-of-contents page.

    Returns:
        list: Absolute ``onlinelibrary.wiley.com`` article URLs found on the page.
    """
    browser = WebScraper(browser="firefox", hidden=False)
    browser.get(url)  # headers could be supplied here via get_headers(...) if needed
    soup = BeautifulSoup(browser.get_html(), "lxml")
    save(browser.get_html())  # keep a debug copy of the rendered page
    anchors = soup.findAll("a", {"class": "issue-item__title visitable"})
    return [f'https://onlinelibrary.wiley.com{anchor.get("href")}' for anchor in anchors]
def decode_email(encoded_str):
    """Decode a Cloudflare-obfuscated (``data-cfemail``) address.

    The first two hex digits are an XOR key; the remainder is the email,
    hex-encoded with every byte XORed against that key.

    Args:
        encoded_str: Hex string taken from a ``data-cfemail`` attribute.

    Returns:
        str: The plain-text email address.
    """
    xor_key = int(encoded_str[:2], 16)
    payload = bytes.fromhex(encoded_str[2:])
    return "".join(chr(b ^ xor_key) for b in payload)
def get_details(url: str):
    """Scrape author name / email / address triples from a Wiley article page.

    Args:
        url (str): Article page URL.

    Returns:
        list: One ``{"Name", "Email", "Address"}`` dict per author that exposes
            a Cloudflare-obfuscated email; authors without one are skipped.
            Empty list when the author section is missing from the page.
    """
    driver = WebScraper(browser="firefox", hidden=False)
    driver.get(url)
    data = driver.get_html()
    full_page = BeautifulSoup(data, "lxml")
    save(full_page)  # debug copy of the parsed page
    author_detail = full_page.find("div", {"class": "accordion-tabbed"})
    if author_detail is None:
        # Layout changed or no author block on this page — fail soft
        # instead of raising AttributeError on None.findAll(...).
        return []
    output = []
    for author in author_detail.findAll("span", {"class": "accordion-tabbed__tab-mobile accordion__closed"}):
        name_tag = author.find("p", {"class": "author-name"})
        email_tag = author.find("span", {"class": "__cf_email__"})
        # Skip entries missing a name tag (previously crashed) or an email.
        if name_tag is None or email_tag is None:
            continue
        output.append(
            {
                "Name": name_tag.text.strip(),
                "Email": decode_email(email_tag.get("data-cfemail")),
                # The <p> immediately after the author-name holds the affiliation.
                "Address": name_tag.findNext("p").text.strip(),
            }
        )
    return output