PrmScrp / degruyterscrapper.py
H4CK3R-5M4CK3R
Scrpr
57273d8
raw
history blame
2.14 kB
import requests
from bs4 import BeautifulSoup
def get_headers(data: str) -> dict:
    """Convert a raw header string into a dictionary.

    Args:
        data (str): Headers as a newline-separated "Key: Value" string
            (e.g. copied from the browser dev tools via
            "Copy Request Headers" / "Copy Response Headers").

    Returns:
        dict: Mapping of header names to values. Values equal to "none",
            "true" or "false" (case-insensitive) are converted to ``None``,
            ``True`` and ``False`` respectively; everything else stays a
            stripped string.
    """
    out = {}
    for line in data.strip().split("\n"):
        # Skip blank lines and lines with no "key: value" separator,
        # which would otherwise raise an IndexError.
        if ":" not in line:
            continue
        key, _, value = line.partition(":")
        key = key.strip()
        value = value.strip()
        lowered = value.lower()
        if lowered == "none":
            value = None
        elif lowered == "true":
            value = True
        elif lowered == "false":
            value = False
        out[key] = value
    return out
def getLinks(url: str) -> list:
    """Collect absolute article URLs from a degruyter.com issue page.

    Args:
        url (str): URL of the issue's HTML table-of-contents page, e.g.
            "https://www.degruyter.com/journal/key/fca/<volume>/<issue>/html".

    Returns:
        list: Absolute links to each research-paper article found on the
            page; empty if the research-paper section is not present.
    """
    browser = requests.session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    links = fullPage.find("div", {"id" : "issue-subject-group-researchpaper"})
    # find() returns None when the section is missing; calling findAll on
    # None would raise AttributeError, so bail out with an empty result.
    if links is None:
        return []
    output = []
    for container in links.findAll("div", {"class" : "text-container"}):
        anchor = container.find("a", {"class" : "issueContentsArticleLink linkHoverDark d-inline-block"})
        if anchor is None:
            continue  # text container without an article link — skip it
        output.append(f"https://www.degruyter.com{anchor.get('href')}")
    return output
def get_author_details(url: str) -> list:
    """Scrape author contact details from a degruyter.com article page.

    Args:
        url (str): Absolute URL of the article page.

    Returns:
        list: One dict per author with a non-empty email address, with keys
            "Name", "Email" and "Address". Authors without an email are
            skipped; empty if the contributors block is missing.
    """
    browser = requests.session()
    data = browser.get(url)
    page = BeautifulSoup(data.text, "lxml")
    authors = page.find("ul", {"class" : "contributors list-unstyled mb-2"})
    # Pages without a contributors list would otherwise raise
    # AttributeError when iterating findAll on None.
    if authors is None:
        return []
    output = []
    for author in authors.findAll("span", {"class" : "contributor"}):
        popdown = author.find("contributor-popdown")
        if popdown is None:
            continue  # this author has no contact widget
        # Tag.get() returns None when the attribute is absent; default to ""
        # so .strip() cannot crash.
        email = (popdown.get("email") or "").strip()
        if not email:
            continue  # only keep authors with a published email address
        output.append(
            {
                "Name" : author.text.strip(),
                "Email" : email,
                "Address" : (popdown.get("affiliations") or "").strip()
            }
        )
    return output