import os
from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator
req = session()

def get_headers(data: str) -> dict:
    """Parse a raw header block (one "Key: Value" pair per line) into a dict."""
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        if not dt.strip():
            # Skip blank lines so a stray empty line cannot crash the split below.
            continue
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()
        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False
        out[key] = value
    return out

def get_email_from_encoding(encoded_str):
    """Decode an "encoded-e-address" payload (base64 -> URL-decode -> JSON) and return
    the e-mail address, or None if it cannot be extracted."""
    try:
        base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
        url_decoded = urllib.parse.unquote(base64_decoded)
        decoded_json = json.loads(url_decoded)
        try:
            if decoded_json["#name"] == 'e-address':
                if decoded_json['$']['type'] == 'email':
                    if 'href' in decoded_json['$']:
                        if 'mailto:' in decoded_json['$']['href']:
                            return decoded_json['$']['href'].replace("mailto:", "")
                        else:
                            return None
                    else:
                        return decoded_json['_']
                else:
                    return None
            else:
                return None
        except Exception as e:
            # Unexpected payload shape: dump it for inspection before aborting.
            with open("jsondata.json", "w") as op:
                json.dump(decoded_json, op)
            print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
            exit()
    except Exception:
        return None
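
# Illustrative sketch (not part of the original scraper): get_email_from_encoding above
# expects JSON that has been URL-encoded and then base64-encoded, carrying the "#name",
# "$" and "_" keys it reads. The sample payload and this helper are hypothetical and only
# reverse those steps so the decoder can be exercised in isolation.
def _demo_email_decoding():
    sample = {
        "#name": "e-address",
        "$": {"type": "email", "href": "mailto:jane.doe@example.org"},
        "_": "jane.doe@example.org",
    }
    # Reverse the decoding pipeline: JSON -> URL-encode -> base64.
    encoded = base64.b64encode(urllib.parse.quote(json.dumps(sample)).encode("utf-8")).decode("utf-8")
    print(get_email_from_encoding(encoded))  # expected output: jane.doe@example.org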

def run(volume: int, last_artical_name: str = None) -> tuple:
    """Fetch the table of contents of one journal volume and collect its author details.

    Args:
        volume (int): Volume number to fetch.
        last_artical_name (str, optional): Title of the previously fetched page, used to
            detect when a non-existent volume redirects to an already-seen page.

    Returns:
        tuple: (author data, page title)
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    # Build the volume's table-of-contents URL for the target journal.
    url = f"https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/{volume}/suppl/C"
    data = req.get(url, headers=headers)
    artical_links = []
    fullpage = BeautifulSoup(data.text, "lxml")
    if fullpage.title.string.strip() == last_artical_name:
        # Same page title as last time: this volume does not exist / we were redirected.
        return None, fullpage.title.string.strip()
    for link in fullpage.find_all("a", {"class": "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")
    n = 1
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for li in artical_links:
        print(f"Fetching data of article {n}")
        authors = stage_two(li)
        auth.extend(authors)
        n += 1
    return auth, fullpage.title.string.strip()

def stage_two(url: str) -> list:
    """Fetch a single article page and extract author names, e-mails and affiliations."""
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)
    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    # The article metadata is embedded in the page as a JSON <script> block.
    json_data = page.find("script", {"type": "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    address = json_data['authors']['affiliations']
    n = 1
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        # Unexpected structure: dump the payload for inspection and stop.
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()
    address = "Not Found"
    addr = []
    authr = []
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            # Author entry: assemble the display name and decode the e-mail address.
            author_name = " "
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
                    if email:
                        authr.append(
                            {
                                'Name': author_name,
                                'Email': email
                            }
                        )
        if author['#name'] == 'affiliation':
            # Affiliation entry: concatenate its text fragments into one address string.
            for cor in author['$$']:
                if '_' in cor:
                    if address == "Not Found":
                        address = cor['_']
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)
    output = []
    for aut in authr:
        try:
            address = addr[authr.index(aut)]
        except IndexError:
            address = "Not Found"
        if address == "Not Found":
            address = ""
        output.append(
            {
                'Name': aut['Name'],
                'Email': aut['Email'],
                'Address': address
            }
        )
    return output

def get_author_info_specific(vol: int) -> list:
    print(f"Getting details of volume {vol}")
    data, page_title = run(vol)
    return data

def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        try:
            data, page_title = run(i, last_page_title)
            if last_page_title == page_title:
                # The page title did not change, so volume i does not exist: stop here.
                print(f"All issues covered of vol {i}, changing volume")
                print("--------------------------------------------------------------------------")
                break
            else:
                last_page_title = page_title
                allAuthors.extend(data)
                print(f"Data received, total authors : {len(allAuthors)}")
        except Exception:
            print(f"ERROR : {traceback.format_exc()}")
            print(f"All issues covered of vol {i}")
            print("--------------------------------------------------------------------------")
            break
    return allAuthors
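
# Minimal usage sketch (an assumption, not shown in this part of the file): crawl a small
# range of volumes and dump the collected authors to JSON. The module imports ExcelAutomator
# from sheets, presumably to export these rows to a spreadsheet, but its interface is not
# shown here, so plain JSON output is used instead. Call _demo_crawl() manually to try it.
def _demo_crawl(from_vol: int = 1, to_vol: int = 3, out_path: str = "authors.json"):
    authors = get_author_info_in_range(from_vol, to_vol)
    with open(out_path, "w") as fp:
        json.dump(authors, fp, indent=4)
    print(f"Saved {len(authors)} author records to {out_path}")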