Spaces:

pryanshusharma
/

PrmScrp

Running

App Files Files Community

PrmScrp / degruyterscrapper.py

pryanshusharma

Update degruyterscrapper.py

a73bd7d verified 6 months ago

raw

history blame contribute delete

2.48 kB

	import requests
	from bs4 import BeautifulSoup

	def get_headers(data: str) -> dict:
	    """Convert a raw block of HTTP headers into a dict.

	    The text is expected to hold one ``Name: value`` pair per line, for
	    example what Firefox puts on the clipboard when you copy the request
	    or response headers of a network entry.

	    Args:
	        data (str): Header lines separated by newlines.

	    Returns:
	        dict: Header names mapped to their values. A value spelling
	            ``none``, ``true`` or ``false`` (in any letter case) becomes
	            ``None``, ``True`` or ``False``; every other value is kept
	            as a whitespace-stripped string. Lines that contain no colon
	            (blank lines, a request line such as ``GET / HTTP/1.1``) are
	            skipped, so empty input yields an empty dict.
	    """
	    literals = {"none": None, "true": True, "false": False}
	    out = {}
	    for line in data.strip().split("\n"):
	        # Split on the first colon only, so values such as URLs keep theirs.
	        key, sep, value = line.partition(":")
	        if not sep:
	            # Not a "Name: value" pair; indexing the split result here
	            # used to raise IndexError.
	            continue
	        value = value.strip()
	        out[key.strip()] = literals.get(value.lower(), value)
	    return out

	def getLinks(url: str) -> list:
	    """Collect the absolute article URLs listed on a De Gruyter issue page.

	    The research-paper section of the page
	    (``div#issue-subject-group-researchpaper``) is tried first. If that
	    section, or an article link inside one of its ``text-container``
	    blocks, is missing, every article link found anywhere on the page is
	    returned instead.

	    Args:
	        url (str): Address of the issue page to download.

	    Returns:
	        list: Article URLs, each prefixed with ``https://www.degruyter.com``.

	    Raises:
	        AttributeError: If the fallback search finds no article link.
	    """
	    base_url = "https://www.degruyter.com"
	    anchor_attrs = {"class": "issueContentsArticleLink linkHoverDark d-inline-block"}

	    # The context manager closes the session's connections once the page
	    # has been downloaded; the response body is already read by then.
	    with requests.session() as browser:
	        data = browser.get(url)
	    fullPage = BeautifulSoup(data.text, "lxml")

	    try:
	        section = fullPage.find("div", {"id": "issue-subject-group-researchpaper"})
	        # find() returns None when nothing matches, so a missing section or
	        # a container without the expected anchor raises AttributeError.
	        output = [
	            f"{base_url}{container.find('a', anchor_attrs).get('href')}"
	            for container in section.find_all("div", {"class": "text-container"})
	        ]
	    except AttributeError:
	        anchors = fullPage.find_all("a", anchor_attrs)
	        if len(anchors) < 1:
	            raise AttributeError("Not found")
	        # Build a fresh list so links gathered before the failure above
	        # are not returned twice.
	        output = [f"{base_url}{anchor.get('href')}" for anchor in anchors]
	    return output

	def get_author_details(url: str) -> list:
	    """Scrape the contributors of an article page that expose an e-mail.

	    Each ``span.contributor`` inside the page's contributors list is
	    inspected; the e-mail and affiliation are read from the attributes of
	    its ``<contributor-popdown>`` element.

	    Args:
	        url (str): Address of the article page to download.

	    Returns:
	        list: One dict per contributor that has a non-empty e-mail, with
	            the keys ``"Name"``, ``"Email"`` and ``"Address"``.
	            Contributors without a popdown element or without an e-mail
	            are left out.

	    Raises:
	        AttributeError: If the page contains no contributors list.
	    """
	    # The context manager closes the session's connections once the page
	    # has been downloaded; the response body is already read by then.
	    with requests.session() as browser:
	        data = browser.get(url)
	    page = BeautifulSoup(data.text, "lxml")

	    contributors = page.find("ul", {"class": "contributors list-unstyled mb-2"})
	    if contributors is None:
	        # Same exception type the failed attribute lookup on None used to
	        # produce, but with a message that names the actual problem.
	        raise AttributeError("Contributors list not found")

	    output = []
	    for author in contributors.find_all("span", {"class": "contributor"}):
	        popdown = author.find("contributor-popdown")
	        if popdown is None:
	            # No popdown means no e-mail to read; skip this contributor
	            # instead of failing the whole page.
	            continue
	        # Tag.get() returns None for a missing attribute.
	        email = (popdown.get("email") or "").strip()
	        if not email:
	            continue
	        # NOTE(review): the affiliations attribute is stored whole;
	        # presumably it can hold several ';'-separated entries - confirm.
	        author_address = (popdown.get("affiliations") or "").strip()
	        output.append(
	            {
	                "Name": author.text.strip(),
	                "Email": email,
	                "Address": author_address,
	            }
	        )
	    return output