H4CK3R-5M4CK3R committed
Commit 57273d8 · 1 Parent(s): 3f5414c
Scrpr
Browse files
- aiimsscrapper.py +64 -0
- amsscrapper.py +72 -0
- degruyterscrapper.py +61 -0
- docs.md +13 -0
- install.bat +2 -0
- notification.mp3 +0 -0
- out.xlsx +0 -0
- requirements.txt +7 -0
- run.bat +1 -0
- sciencedirect.py +210 -0
- sciencedirect_admaths.py +207 -0
- seleliumdriver.py +40 -0
- server.py +219 -0
- sheets.py +32 -0
- springerscrapper.py +103 -0
- wileyscrapper.py +92 -0
aiimsscrapper.py
ADDED
@@ -0,0 +1,64 @@
from seleliumdriver import WebScraper
from bs4 import BeautifulSoup
import time
import requests

def get_links(url: str):
    browser = WebScraper("firefox", hidden=True)
    browser.get(url)
    time.sleep(5)  # Important: give the page time to render before grabbing the HTML
    pagehtml = browser.get_html()
    browser.close_browser()
    fullPage = BeautifulSoup(pagehtml, "lxml")
    articals = fullPage.find("div", {"class": "j-archive-article"})
    output = []
    for link in articals.findAll("a", {"class" : "tit ng-binding ng-scope"}):
        output.append("https://www.aimspress.com" + link.get("href"))
    if len(output) < 1:
        raise ValueError("Invalid url found")
    return output

def save(dt):
    with open("data.html", "w") as op:
        op.write(str(dt))
    print("Done saved")

def get_author_details(url: str):
    browser = requests.session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    authors = fullPage.find("ul", {"class" : "article-author clear"})
    output = []
    author_about = fullPage.find("ul", {"class" : "about-author"})
    authors_about = [d.text.strip() for d in author_about.findAll("div", {"class" : "lostOf"})]
    for author in authors.findAll("li"):
        author_name = author.find("a", {"type" : "authors.authorNameEn"}).text.strip()
        mail = author.find("a", {"class" : "com-mail"})
        if mail:
            mail = mail.get("href").split(":", 1)[1].strip()
        else:
            continue
        try:
            author_value_tag = author.find("a", {"class" : "com-num"}).get("data-tagval")
            if "," in author_value_tag:
                author_value_tag = [int(da) - 1 for da in author_value_tag.split(",")]
            else:
                author_value_tag = [int(author_value_tag) - 1]
            address = None
            for a in author_value_tag:
                if address:
                    address = f"{address} & {authors_about[a]}"
                else:
                    address = authors_about[a]
        except:
            author_name = author.find("a", {"type" : "authors.authorNameEn"}).text.strip()
            mail = author.find("a", {"class" : "com-mail"}).get("title").split(":", 1)[1].strip()
            address = author.find("a", {"class" : "com-user"}).get("title").split(":", 1)[1].strip()
        output.append(
            {
                "Name" : author_name,
                "Email" : mail,
                "Address" : address
            }
        )
    return output
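A minimal usage sketch for this module, assuming a valid AIMS Press issue URL (the URL below is only a placeholder, not taken from the repo): collect the article links for an issue, then pull the author name, e-mail, and affiliation from each article page.

# Hypothetical driver for aiimsscrapper; the issue URL is a placeholder.
import aiimsscrapper

issue_url = "https://www.aimspress.com/journal/era/2024/1"  # placeholder issue URL
for article_url in aiimsscrapper.get_links(issue_url):
    for author in aiimsscrapper.get_author_details(article_url):
        # Each entry is a dict with "Name", "Email" and "Address" keys
        print(author["Name"], author["Email"], author["Address"])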
amsscrapper.py
ADDED
@@ -0,0 +1,72 @@
import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator

def get_headers(data: str) -> dict:
    """This function converts headers from a string into a dict.

    Args:
        data (str): The headers as a string (in Firefox you can copy the request or response headers and this will convert them into a dict)

    Returns:
        dict: The headers as a dict
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def getlinks(url: str) -> list:
    browser = requests.session()
    # url = f"https://www.ams.org/journals/jams/{year}-{volume}-{issue}/home.html?active=allissues"
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    article = fullPage.find("article", {"class" : "contentList"})
    output = []
    lnk = url.split('home.html', 1)[0]
    for allarticle in article.findAll("dl"):
        output.append(f'{lnk}{allarticle.find("dt").find("a").get("href")}')
    return output

def get_authors(url: str):
    browser = requests.session()
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    details = fullPage.find("section", {"id" : "additionalinformation"})
    email = None
    address = None
    author_name = None
    output = []
    for author in details.findAll("li"):
        if email != None and author_name != None and address != None:
            output.append(
                {
                    "Name" : author_name,
                    "Email" : email,
                    "Address" : address
                }
            )
            email = None
            author_name = None
            address = None
        if author.find("strong"):
            author_name = author.text
        elif "Email:" in author.text:
            email = author.text.split(":", 1)[1].strip()
        elif "Affiliation:" in author.text:
            address = author.text.split(":", 1)[1].strip()
        if author_name == None:
            continue
    return output
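As a quick illustration of the `get_headers` helper that several of these modules repeat, here is a small self-contained sketch; the header block is an arbitrary example, not one copied from the repo.

from amsscrapper import get_headers

raw = """
User-Agent: Mozilla/5.0
Accept-Language: en-US,en;q=0.5
"""
print(get_headers(raw))
# {'User-Agent': 'Mozilla/5.0', 'Accept-Language': 'en-US,en;q=0.5'}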
degruyterscrapper.py
ADDED
@@ -0,0 +1,61 @@
import requests
from bs4 import BeautifulSoup

def get_headers(data: str) -> dict:
    """This function converts headers from a string into a dict.

    Args:
        data (str): The headers as a string (in Firefox you can copy the request or response headers and this will convert them into a dict)

    Returns:
        dict: The headers as a dict
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def getLinks(url: str) -> list:
    browser = requests.session()
    # url = f"https://www.degruyter.com/journal/key/fca/{volume}/{issue}/html"
    data = browser.get(url)
    fullPage = BeautifulSoup(data.text, "lxml")
    links = fullPage.find("div", {"id" : "issue-subject-group-researchpaper"})
    output = []
    for link in links.findAll("div", {"class" : "text-container"}):
        link = link.find("a", {"class" : "issueContentsArticleLink linkHoverDark d-inline-block"}).get("href")
        output.append(f"https://www.degruyter.com{link}")
    return output

def get_author_details(url: str) -> list:
    browser = requests.session()
    data = browser.get(url)
    authors = BeautifulSoup(data.text, "lxml")
    authors = authors.find("ul", {"class" : "contributors list-unstyled mb-2"})
    output = []
    for author in authors.findAll("span", {"class" : "contributor"}):
        author_name = author.text.strip()
        author_address = author.find("contributor-popdown").get("affiliations").strip()
        email = author.find("contributor-popdown").get("email").strip()
        if len(email.strip()) < 1:
            continue
        output.append(
            {
                "Name" : author_name,
                "Email" : email,
                "Address" : author_address
            }
        )
    return output
docs.md
ADDED
@@ -0,0 +1,13 @@
## Steps to install

- Make sure that Python and pip are installed. You can run the `python` command to check whether Python is available.
- Check that pip works by running the `pip` command. If it does not, use `python -m pip install -r requirements.txt`; if the `python` command is not available either, use `python3 -m pip install -r requirements.txt`.
- If only `python3` works, edit `run.bat` and change `python` to `python3`.
- If the `python` command works fine, just double-click `install.bat`.
- Then run `pip install webdriver-manager`. If that command does not work, use `python -m pip install webdriver-manager` (or `python3` instead of `python`).
- Now everything is up and running.

## Steps to use

- To use the script, just double-click the `run.bat` file.
- Then open `http://127.0.0.1:7860` in your browser and you are good to go.
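Not covered above: `server.py` accepts either a plain issue URL or a templated one with `{v}` and `{i}` placeholders for volume and issue. A minimal sketch of how `filterUrlandRun` expands such a template (the De Gruyter URL is only an illustrative placeholder adapted from the comment in `degruyterscrapper.py`):

# Sketch of the URL templating used by server.py; the journal URL is a placeholder.
url = "https://www.degruyter.com/journal/key/fca/{v}/{i}/html"
for vol in range(26, 28):                      # "From volume" .. "To volume"
    per_volume = url.format(v=vol, i="{i}")    # volume filled in, issue left as {i}
    first_issue = per_volume.format(i=1)       # issue filled in later, per iteration
    print(first_issue)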
install.bat
ADDED
@@ -0,0 +1,2 @@
python -m pip install -r requirements.txt
echo Installation completed successfully
notification.mp3
ADDED
Binary file (931 kB)
out.xlsx
ADDED
Binary file (105 kB)
requirements.txt
ADDED
@@ -0,0 +1,7 @@
bs4
pygame
gradio
openpyxl
selenium
requests
webdriver-manager
run.bat
ADDED
@@ -0,0 +1 @@
python server.py
sciencedirect.py
ADDED
@@ -0,0 +1,210 @@
import os
from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator

req = session()

def get_headers(data: str) -> dict:
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def get_email_from_encoding(encoded_str):
    base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
    url_decoded = urllib.parse.unquote(base64_decoded)
    decoded_json = json.loads(url_decoded)
    try:
        if decoded_json["#name"] == 'e-address':
            if decoded_json['$']['type'] == 'email':
                if 'href' in decoded_json['$']:
                    if 'mailto:' in decoded_json['$']['href']:
                        return decoded_json['$']['href'].replace("mailto:", "")
                    else:
                        return None
                else:
                    return decoded_json['_']
            else:
                return None
        else:
            return None
    except Exception as e:
        with open("jsondata.json", "w") as op:
            json.dump(decoded_json, op)
        print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
        exit()

def run(url: str, last_artical_name: str=None) -> tuple:
    """This function collects author details from an issue listing page

    Args:
        url (str): The volume/issue listing URL to scrape
        last_artical_name (str, optional): Title of the previously scraped page, used to stop when a page repeats

    Returns:
        tuple : It includes auth data and page title
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """

    headers = get_headers(headers)

    # url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"

    data = req.get(url, headers=headers)

    artical_links = []
    fullpage = BeautifulSoup(str(data.text), "lxml")
    if fullpage.title.string.strip() == last_artical_name:
        return None, fullpage.title.string.strip()
    for link in fullpage.findAll("a", {"class" : "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")
    n = 1
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for li in artical_links:
        print(f"Fetching data for article {n}")
        authors = stage_two(li)
        auth.extend(authors)
        n += 1
    return auth, fullpage.title.string.strip()

def stage_two(url: str) -> list:

    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)

    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    json_data = page.find("script", {"type" : "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    address = json_data['authors']['affiliations']
    n = 1
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()
    address = "Not Found"
    addr = []
    authr = []
    email = None
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            # Author data
            author_name = " "
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
                    if email:
                        authr.append(
                            {
                                'Name' : author_name,
                                'Email' : email
                            }
                        )
                    else:
                        continue
        if author['#name'] == 'affiliation':
            for cor in author['$$']:
                if '_' in cor:
                    if address == "Not Found":
                        address = cor['_']
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)

    output = []
    for aut in authr:
        try:
            address = addr[authr.index(aut)]
        except:
            address = "Not Found"
        if address == "Not Found":
            address = url
        output.append(
            {
                'Name' : aut['Name'],
                'Email' : aut['Email'],
                'Address' : address
            }
        )
    return output

def get_author_info_specific(vol: int, issue: int) -> list:
    print(f"Getting detail of volume {vol} and issue {issue}")
    data, page_title = run(vol, issue)
    return data

def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        d = 1
        while True:
            try:
                data, page_title = run(i, d, last_page_title)
                if last_page_title == page_title:
                    print(f"All issues covered of vol {i}, changing volume")
                    print("--------------------------------------------------------------------------")
                    break
                else:
                    last_page_title = page_title
                allAuthors.extend(data)
                print(f"Issue {d} data received, total authors : {len(allAuthors)}")
            except Exception as e:
                print(f"ERROR : {traceback.format_exc()}")
                print(f"All issues covered of vol {i}")
                print("--------------------------------------------------------------------------")
                break
            d += 1
    return allAuthors
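To see what `get_email_from_encoding` expects, here is a small round-trip sketch with a synthetic payload: the JSON structure mimics the `encoded-e-address` entries this module reads from ScienceDirect's embedded JSON, and the e-mail address is made up.

import base64, json, urllib.parse
from sciencedirect import get_email_from_encoding

# Build a synthetic encoded e-address entry: JSON -> URL-encode -> base64
entry = {"#name": "e-address", "$": {"type": "email", "href": "mailto:jane.doe@example.org"}}
encoded = base64.b64encode(urllib.parse.quote(json.dumps(entry)).encode()).decode()

print(get_email_from_encoding(encoded))  # -> jane.doe@example.org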
sciencedirect_admaths.py
ADDED
@@ -0,0 +1,207 @@
import os
from requests import session
from bs4 import BeautifulSoup
import base64
import urllib.parse
import traceback
import json
from sheets import ExcelAutomator

req = session()

def get_headers(data: str) -> dict:
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def get_email_from_encoding(encoded_str):
    try:
        base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
        url_decoded = urllib.parse.unquote(base64_decoded)
        decoded_json = json.loads(url_decoded)
        try:
            if decoded_json["#name"] == 'e-address':
                if decoded_json['$']['type'] == 'email':
                    if 'href' in decoded_json['$']:
                        if 'mailto:' in decoded_json['$']['href']:
                            return decoded_json['$']['href'].replace("mailto:", "")
                        else:
                            return None
                    else:
                        return decoded_json['_']
                else:
                    return None
            else:
                return None
        except Exception as e:
            with open("jsondata.json", "w") as op:
                json.dump(decoded_json, op)
            print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
            exit()
    except:
        return None

def run(url: str, last_artical_name: str=None) -> tuple:
    """This function collects author details from a volume listing page

    Args:
        url (str): The volume listing URL to scrape
        last_artical_name (str, optional): Title of the previously scraped page, used to stop when a page repeats

    Returns:
        tuple : It includes auth data and page title
    """
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """

    headers = get_headers(headers)

    # url = f"https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/{volume}/suppl/C"

    data = req.get(url, headers=headers)

    artical_links = []
    fullpage = BeautifulSoup(str(data.text), "lxml")
    if fullpage.title.string.strip() == last_artical_name:
        return None, fullpage.title.string.strip()
    for link in fullpage.findAll("a", {"class" : "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
        artical_links.append("https://www.sciencedirect.com" + link.get("href"))
    print(f"Total articles found : {len(artical_links)}")
    n = 1
    auth = []
    print(f"Getting all articles from - {fullpage.title.string}")
    for li in artical_links:
        print(f"Fetching data for article {n}")
        authors = stage_two(li)
        auth.extend(authors)
        n += 1
    return auth, fullpage.title.string.strip()

def stage_two(url: str) -> list:

    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    """
    headers = get_headers(headers)

    data = req.get(url, headers=headers)
    page = BeautifulSoup(data.text, "lxml")
    json_data = page.find("script", {"type" : "application/json"})
    json_data = json.loads(json_data.text.strip())
    authors_detail = []
    address = json_data['authors']['affiliations']
    n = 1
    if len(json_data['authors']['content']) < 1:
        return authors_detail
    if '$$' not in json_data['authors']['content'][0]:
        with open("jsondata.json", "w") as op:
            json.dump(json_data, op, indent=4)
        print("ERROR Check jsondata file")
        exit()
    address = "Not Found"
    addr = []
    authr = []
    for author in json_data['authors']['content'][0]['$$']:
        if author['#name'] == 'author':
            # Author data
            author_name = " "
            for au in author['$$']:
                if au['#name'] == 'given-name' or au['#name'] == 'name':
                    author_name = au['_'] + author_name
                if au['#name'] == 'surname':
                    author_name = f"{author_name}{au['_']}"
                if au['#name'] == 'encoded-e-address':
                    email = get_email_from_encoding(au['__encoded'])
                    if email:
                        authr.append(
                            {
                                'Name' : author_name,
                                'Email' : email
                            }
                        )
        if author['#name'] == 'affiliation':
            for cor in author['$$']:
                if '_' in cor:
                    if address == "Not Found":
                        address = cor['_']
                    else:
                        address = f"{address} {cor['_']}"
            addr.append(address)

    output = []
    for aut in authr:
        try:
            address = addr[authr.index(aut)]
        except:
            address = "Not Found"
        if address == "Not Found":
            address = url
        output.append(
            {
                'Name' : aut['Name'],
                'Email' : aut['Email'],
                'Address' : address
            }
        )
    return output

def get_author_info_specific(vol: int) -> list:
    print(f"Getting detail of volume {vol}")
    data, page_title = run(vol)
    return data

def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
    allAuthors = []
    last_page_title = None
    for i in range(from_vol, to_vol + 1):
        print(f"Getting data of vol {i}")
        try:
            data, page_title = run(i, last_page_title)
            if last_page_title == page_title:
                print(f"All issues covered of vol {i}, changing volume")
                print("--------------------------------------------------------------------------")
                break
            else:
                last_page_title = page_title
            allAuthors.extend(data)
            print(f"Data received, total authors : {len(allAuthors)}")
        except Exception as e:
            print(f"ERROR : {traceback.format_exc()}")
            print(f"All issues covered of vol {i}")
            print("--------------------------------------------------------------------------")
            break
    return allAuthors
seleliumdriver.py
ADDED
@@ -0,0 +1,40 @@
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.firefox.options import Options as FirefoxOptions

class WebScraper:

    def __init__(self, browser='chrome', hidden=True):
        if browser.lower() == 'chrome':
            options = ChromeOptions()
            if hidden:
                options.add_argument('--headless')
                options.add_argument('--window-size=1920,1200')
            self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
        elif browser.lower() == 'firefox':
            options = FirefoxOptions()
            if hidden:
                options.add_argument('--headless')
                options.add_argument('--window-size=1920,1200')
            self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=options)
        else:
            raise ValueError('Unsupported browser. Only "chrome" and "firefox" are supported.')

    def get(self, url, wait_time=10):
        self.driver.get(url)
        WebDriverWait(self.driver, wait_time).until(
            EC.presence_of_element_located((By.TAG_NAME, 'body'))
        )

    def get_html(self):
        return self.driver.page_source

    def close_browser(self):
        self.driver.quit()
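A short usage sketch for `WebScraper` (the URL is a placeholder): open a headless Firefox via webdriver-manager, grab the rendered HTML, and close the browser.

from seleliumdriver import WebScraper

scraper = WebScraper("firefox", hidden=True)   # downloads geckodriver on first run
scraper.get("https://example.com")             # placeholder URL; waits for <body> to load
html = scraper.get_html()                      # rendered page source
scraper.close_browser()
print(len(html))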
server.py
ADDED
@@ -0,0 +1,219 @@
import gradio as gr
import aiimsscrapper
import amsscrapper
import degruyterscrapper
import sciencedirect
import sciencedirect_admaths
import springerscrapper
import wileyscrapper
from urllib.parse import urlparse
import traceback
from sheets import ExcelAutomator
import pygame
import threading
from datetime import datetime
import os

def play_sound():
    pygame.mixer.init()
    pygame.mixer.music.load("notification.mp3") # Ensure this file exists
    pygame.mixer.music.play()

def print(data: str):
    if not os.path.exists("LOGS.txt"):
        with open("LOGS.txt", "w") as op:
            op.write(f"{datetime.now().hour}-{datetime.now().minute}-{datetime.now().second}.{datetime.now().microsecond}/{datetime.now().day}-{datetime.now().month}-{datetime.now().year}\n------------------------------------------------------------------\n")
    with open("LOGS.txt", "a") as op:
        op.write(f"\n{datetime.now().hour}-{datetime.now().minute}-{datetime.now().second}.{datetime.now().microsecond}/{datetime.now().day}-{datetime.now().month}-{datetime.now().year} -> {data}")
    gr.Info(data, duration=3)

def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str) -> str:
    url_sch = urlparse(url)
    domain = url_sch.hostname
    sht = ExcelAutomator([
        "Name",
        "Email",
        "Address"
    ],
        output
    )
    filen = True
    if "{" in url:
        links = []
        if reverse:
            for vol in reversed(range(from_range, to_range)):
                print(url)
                links.append(url.format(v=vol, i="{i}"))
        else:
            for vol in range(from_range, to_range):
                print(url)
                links.append(url.format(v=vol, i="{i}"))
    else:
        links = [url]
        filen = False
    print(f"Total links found {len(links)}")
    try:
        if domain == "www.ams.org" or domain == "ams.org":
            # AMS scraper
            for ur in links:
                isu = 1
                while True:
                    if len(str(isu)) < 2:
                        isu = f"0{isu}"
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")

                            allLinks = amsscrapper.getlinks(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = amsscrapper.getlinks(ur)
                    except:
                        print("Error")
                        break
                    for link in allLinks:
                        authors = amsscrapper.get_authors(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "www.degruyter.com" or domain == "degruyter.com":
            # De Gruyter scraper
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = degruyterscrapper.getLinks(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = degruyterscrapper.getLinks(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = degruyterscrapper.get_author_details(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "www.aimspress.com" or domain == "aimspress.com":
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = aiimsscrapper.get_links(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = aiimsscrapper.get_links(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = aiimsscrapper.get_author_details(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "link.springer.com":
            # Springer scraping here
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = springerscrapper.get_all_articals_link(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = springerscrapper.get_all_articals_link(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = springerscrapper.get_authors(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" not in url:
            # Regular ScienceDirect scraping here
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = sciencedirect.run(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = sciencedirect.run(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = sciencedirect.stage_two(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" in url:
            # Acta Mathematica Scientia data here
            for ur in links:
                isu = 1
                while True:
                    try:
                        if filen:
                            print(f"Getting data for link {ur.format(i=isu)}")
                            allLinks = sciencedirect_admaths.run(ur.format(i=isu))
                            isu += 1
                        else:
                            print(f"Getting data for link {ur}")
                            allLinks = sciencedirect_admaths.run(ur)
                    except:
                        break
                    for link in allLinks:
                        authors = sciencedirect_admaths.stage_two(link)
                        for auth in authors:
                            sht.save(auth)
                    if filen == False: # Only a single link, so don't keep looping
                        break
            sht.save_to_file()
            return sht.save_to_file()
        else:
            raise gr.Error("Invalid URL found, contact @H4CK3R_5M4CK3R on telegram")
    except gr.Error:
        pass
    except:
        with open("ERROR-LOGS.txt", "w") as op:
            op.write(f"Error {url} : {traceback.format_exc()}")
        raise gr.Error("Some error has occurred, check your URL or contact @h4ck3r_5m4ck3r on telegram")

def handle_url(url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
    output = filterUrlandRun(url, From_volume, To_Volume, Reverse, Output)
    threading.Thread(target=play_sound).start()
    return output

interface = gr.Interface(
    fn=handle_url,
    inputs=["textbox", "number", "number", "textbox","checkbox"],
    outputs="file",
    title="Web Scrapper",
    description="Enter a URL and download a generated XLSX file."
)

interface.launch()
sheets.py
ADDED
@@ -0,0 +1,32 @@
import openpyxl
import os

class ExcelAutomator:

    def __init__(self, name: list, output: str):
        self.columns = name
        self.output = output if output.endswith(".xlsx") else f"{output}.xlsx"

        if os.path.exists(self.output):
            self.workbook = openpyxl.load_workbook(self.output)
            self.sheet = self.workbook.active
        else:
            self.workbook = openpyxl.Workbook()
            self.sheet = self.workbook.active
            for col_num, column_name in enumerate(self.columns, 1):
                self.sheet.cell(row=1, column=col_num, value=column_name)

    def save(self, data_dict):
        """
        Save a new row of data to the Excel file.
        :param data_dict: Dictionary with keys as column names and values as the data to save.
        """
        row_data = [data_dict.get(column, None) for column in self.columns]
        self.sheet.append(row_data)

    def save_to_file(self):
        """
        Save the workbook to a file.
        """
        self.workbook.save(self.output)
        return self.output
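A minimal sketch of how the scrapers use `ExcelAutomator`: create it with the column names, append one dict per author, and write the workbook out. The row below is made-up sample data.

from sheets import ExcelAutomator

sheet = ExcelAutomator(["Name", "Email", "Address"], "out")   # writes out.xlsx
sheet.save({"Name": "Jane Doe", "Email": "jane.doe@example.org", "Address": "Example University"})
path = sheet.save_to_file()
print(f"Saved to {path}")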
springerscrapper.py
ADDED
@@ -0,0 +1,103 @@
import requests
from bs4 import BeautifulSoup
import json
from sheets import ExcelAutomator

def get_headers(data: str) -> dict:
    """This function converts headers from a string into a dict.

    Args:
        data (str): The headers as a string (in Firefox you can copy the request or response headers and this will convert them into a dict)

    Returns:
        dict: The headers as a dict
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def get_all_articals_link(url: str) -> dict:
    browser = requests.session()
    # url = f"https://link.springer.com/journal/208/volumes-and-issues/{volume}-{issue}"
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Referer: https://link.springer.com/journal/208/volumes-and-issues
    Alt-Used: link.springer.com
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: same-origin
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """

    head = get_headers(headers)

    data = browser.get(url, headers=head)

    fullpage = BeautifulSoup(data.text, "lxml")

    orderlist = fullpage.find("ol", {"class" : "u-list-reset"})
    allLinks = []
    for dt in orderlist.findAll("li"):
        if not dt.find("a"):
            continue
        allLinks.append(dt.find("a").get("href"))
    return allLinks

def get_authors(url: str) -> list:
    browser = requests.session()
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Referer: https://link.springer.com/journal/208/volumes-and-issues
    Alt-Used: link.springer.com
    Connection: keep-alive
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: same-origin
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """

    head = get_headers(headers)
    data = browser.get(url, headers=head)

    main_page = BeautifulSoup(data.text, "lxml")

    json_data = main_page.find("script", {"type" : "application/ld+json"}).text
    json_data = json.loads(json_data)
    authors = json_data['mainEntity']['author']
    output = []
    for author in authors:
        if 'email' in author:
            output.append(
                {
                    "Name" : author['name'],
                    'Email' : author['email'],
                    'Address' : ", ".join(item['address']['name'] for item in author['affiliation'] if 'address' in item and 'name' in item['address'])
                }
            )
    return output
wileyscrapper.py
ADDED
@@ -0,0 +1,92 @@
import requests
from bs4 import BeautifulSoup
from sheets import ExcelAutomator
from seleliumdriver import WebScraper

browser = requests.session()

def save(data:str):
    with open("data.html", "w") as op:
        op.write(str(data))

def get_headers(data: str) -> dict:
    """This function converts headers from a string into a dict.

    Args:
        data (str): The headers as a string (in Firefox you can copy the request or response headers and this will convert them into a dict)

    Returns:
        dict: The headers as a dict
    """
    data = data.strip()
    data = data.split("\n")
    out = {}
    for dt in data:
        key = dt.split(":", 1)[0].strip()
        value = dt.split(":", 1)[1].strip()

        if value.lower() == "none":
            value = None
        elif value.lower() == "true":
            value = True
        elif value.lower() == "false":
            value = False

        out[key] = value
    return out

def get_links(url: str, issue: int) -> list:
    headers = """
    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
    Accept-Language: en-US,en;q=0.5
    Accept-Encoding: gzip, deflate, br
    Alt-Used: onlinelibrary.wiley.com
    Connection: keep-alive
    Cookie: MAID=5SmfjugKsbanANmqY7QFTQ==; MACHINE_LAST_SEEN=2024-07-13T00%3A17%3A54.434-07%3A00; osano_consentmanager_uuid=a09cf48c-a316-44da-a630-b284fe618561; osano_consentmanager=wtFVO73sxrqPK1QjgWvz2PRznZ_IuLc6ARcv2t0_pFtepafXHZgrg-S478uJo9AvbIWsu3sbpgmvaCKL_zNkJQZzpvdHNzGX6NQQ6cwL_c09p-7H9gmYq7lFeOBlGJxYVbwgVIa5TZDqtpLjvla4iYf-rEyPZ0zXi8nZVVY5aCRrKBkWiIYkwIWvpeVeBLepXirD0RkYCGg-O2PWE000CQi4kWVXGTOkNMFqFOSQ-tthQqpC7pvT9AeCAodC2z6CeM6tTjz3TNmp8sTxikwwT4jzZ9HRy76gqQjb8g==; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_identity=CiY4MDg5NTE5MTAxMTg2NDkzMzQzMTI2OTY5MjMzMTU3OTYwODc1N1ITCM6izY3mMRABGAEqBElORDEwAPAB5cnS14oy; Hm_lvt_953dddc9c7bea72022e3bd3ba7782e7b=1720765103,1720855080; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=MCMID|80895191011864933431269692331579608757; JSESSIONID=90BFBDCF8874DBB2B708D37ACC4172DD; __cf_bm=FgCtBcokrG75eoj6.nqj2jTcbcl.vtSPGArq4iAYwYk-1720855074-1.0.1.1-OCKWcrDvKtyaaNLld1aBjaFFwZLoLHauSzJ0NEZFn1JLYK4G4lqmaTMEE50PAzZCReTc13aRgLNyLlqu6JOllleWjBRMQr5vc3YjxJ4kdPs; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_cluster=ind1; cf_clearance=B0r0CEgCWVP2M5CKvRhRTvIW8MyIJM2WBVS14NsHxxE-1720855079-1.0.1.1-CqrZHd19zoe3QCemtBtqxsHiVLXILmnPkb9RjSG2yHndhy.XZzt14jGpjymiEPzjA0nFP7xw1hU6xsXIz6UDSg; Hm_lpvt_953dddc9c7bea72022e3bd3ba7782e7b=1720855160; HMACCOUNT=C851A9F6625CC221; randomizeUser=0.5543043437474287
    Upgrade-Insecure-Requests: 1
    Sec-Fetch-Dest: document
    Sec-Fetch-Mode: navigate
    Sec-Fetch-Site: none
    Sec-Fetch-User: ?1
    Sec-GPC: 1
    TE: trailers
    """
    # url = f"https://onlinelibrary.wiley.com/toc/14679590/{year}/{volume}/{issue}"
    data = browser.get(url, headers=get_headers(headers))
    fullPage = BeautifulSoup(data.text, "lxml")
    issuelinks = []
    for link in fullPage.findAll("a", {"class" : "issue-item__title visitable"}):
        issuelinks.append(f'https://onlinelibrary.wiley.com{link.get("href")}')
    return issuelinks

def decode_email(encoded_str):
    key = int(encoded_str[:2], 16)
    encoded_bytes = bytes.fromhex(encoded_str[2:])
    decoded_email = ''.join(chr(byte ^ key) for byte in encoded_bytes)
    return decoded_email

def get_details(url: str):
    driver = WebScraper(browser="firefox", hidden=False)
    driver.get(url)
    data = driver.get_html()
    # save(data.text)
    full_page = BeautifulSoup(data, "lxml")
    author_detail = full_page.find("div", {"class" : "accordion-tabbed"})
    output = []
    save(full_page)
    for author in author_detail.findAll("span", {"class" : "accordion-tabbed__tab-mobile accordion__closed"}):
        author_name = author.find("p", {"class" : "author-name"}).text.strip()
        if author.find("span", {"class" : "__cf_email__"}) == None:
            continue
        email = decode_email(author.find("span", {"class" : "__cf_email__"}).get("data-cfemail"))
        address = author.find("p", {"class" : "author-name"}).findNext("p").text.strip()
        output.append(
            {
                "Name" : author_name,
                "Email" : email,
                'Address' : address
            }
        )

    return output
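The Cloudflare `data-cfemail` scheme that `decode_email` reverses is a simple XOR: the first hex byte is the key and every following byte is a character XORed with it. A self-contained round-trip check with a made-up address and key (the encoder below is only an illustrative inverse, not part of the repo):

from wileyscrapper import decode_email

def encode_email(email: str, key: int = 0x42) -> str:
    # Inverse of decode_email: prepend the key byte, then XOR each character with it
    return f"{key:02x}" + bytes(ord(c) ^ key for c in email).hex()

encoded = encode_email("a@b.c")   # -> "422302206c21"
print(decode_email(encoded))      # -> "a@b.c"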