H4CK3R-5M4CK3R committed on
Commit 57273d8 · 1 Parent(s): 3f5414c
aiimsscrapper.py ADDED
@@ -0,0 +1,64 @@
+ from seleliumdriver import WebScraper
+ from bs4 import BeautifulSoup
+ import time
+ import requests
+
+ def get_links(url: str):
+     browser = WebScraper("firefox", hidden=True)
+     browser.get(url)
+     time.sleep(5)  # Important: wait for the Angular page to finish rendering before reading it
+     pagehtml = browser.get_html()
+     browser.close_browser()
+     fullPage = BeautifulSoup(pagehtml, "lxml")
+     articals = fullPage.find("div", {"class": "j-archive-article"})
+     output = []
+     for link in articals.findAll("a", {"class": "tit ng-binding ng-scope"}):
+         output.append("https://www.aimspress.com" + link.get("href"))
+     if len(output) < 1:
+         raise ValueError("Invalid url found")
+     return output
+
+ def save(dt):
+     with open("data.html", "w") as op:
+         op.write(str(dt))
+     print("Saved data.html")
+
+ def get_author_details(url: str):
+     browser = requests.session()
+     data = browser.get(url)
+     fullPage = BeautifulSoup(data.text, "lxml")
+     authors = fullPage.find("ul", {"class": "article-author clear"})
+     output = []
+     author_about = fullPage.find("ul", {"class": "about-author"})
+     authors_about = [d.text.strip() for d in author_about.findAll("div", {"class": "lostOf"})]
+     for author in authors.findAll("li"):
+         author_name = author.find("a", {"type": "authors.authorNameEn"}).text.strip()
+         mail = author.find("a", {"class": "com-mail"})
+         if mail:
+             mail = mail.get("href").split(":", 1)[1].strip()
+         else:
+             continue
+         try:
+             # The data-tagval attribute maps an author to one or more affiliation entries
+             author_value_tag = author.find("a", {"class": "com-num"}).get("data-tagval")
+             if "," in author_value_tag:
+                 author_value_tag = [int(da) - 1 for da in author_value_tag.split(",")]
+             else:
+                 author_value_tag = [int(author_value_tag) - 1]
+             address = None
+             for a in author_value_tag:
+                 if address:
+                     address = f"{address} & {authors_about[a]}"
+                 else:
+                     address = authors_about[a]
+         except:
+             # Fallback: some article pages expose the details directly in title attributes
+             author_name = author.find("a", {"type": "authors.authorNameEn"}).text.strip()
+             mail = author.find("a", {"class": "com-mail"}).get("title").split(":", 1)[1].strip()
+             address = author.find("a", {"class": "com-user"}).get("title").split(":", 1)[1].strip()
+         output.append(
+             {
+                 "Name": author_name,
+                 "Email": mail,
+                 "Address": address
+             }
+         )
+     return output
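For reference, a minimal sketch (not part of the commit) of how `get_links` and `get_author_details` are meant to be chained together with `ExcelAutomator` from `sheets.py`. It drives headless Firefox via `seleliumdriver`, and the archive URL below is a hypothetical placeholder, not a real AIMS listing.

```python
# Hypothetical usage sketch; the archive URL is a placeholder
import aiimsscrapper
from sheets import ExcelAutomator

sheet = ExcelAutomator(["Name", "Email", "Address"], "aims_authors.xlsx")
for link in aiimsscrapper.get_links("https://www.aimspress.com/journal/example-archive"):
    for author in aiimsscrapper.get_author_details(link):
        sheet.save(author)
print(f"Workbook written to {sheet.save_to_file()}")
```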
amsscrapper.py ADDED
@@ -0,0 +1,72 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from sheets import ExcelAutomator
+
+ def get_headers(data: str) -> dict:
+     """Convert a copied header block (one "Key: value" pair per line) into a dict.
+
+     Args:
+         data (str): The headers as a string (in Firefox you can copy the request or response headers and paste them here)
+
+     Returns:
+         dict: The parsed headers
+     """
+     data = data.strip()
+     data = data.split("\n")
+     out = {}
+     for dt in data:
+         key = dt.split(":", 1)[0].strip()
+         value = dt.split(":", 1)[1].strip()
+
+         if value.lower() == "none":
+             value = None
+         elif value.lower() == "true":
+             value = True
+         elif value.lower() == "false":
+             value = False
+
+         out[key] = value
+     return out
+
+ def getlinks(url: str) -> list:
+     browser = requests.session()
+     # url = f"https://www.ams.org/journals/jams/{year}-{volume}-{issue}/home.html?active=allissues"
+     data = browser.get(url)
+     fullPage = BeautifulSoup(data.text, "lxml")
+     article = fullPage.find("article", {"class": "contentList"})
+     output = []
+     lnk = url.split('home.html', 1)[0]
+     for allarticle in article.findAll("dl"):
+         output.append(f'{lnk}{allarticle.find("dt").find("a").get("href")}')
+     return output
+
+ def get_authors(url: str):
+     browser = requests.session()
+     data = browser.get(url)
+     fullPage = BeautifulSoup(data.text, "lxml")
+     details = fullPage.find("section", {"id": "additionalinformation"})
+     email = None
+     address = None
+     author_name = None
+     output = []
+     for author in details.findAll("li"):
+         # Once a full name/email/affiliation triple is collected, flush it before starting the next author
+         if email != None and author_name != None and address != None:
+             output.append(
+                 {
+                     "Name": author_name,
+                     "Email": email,
+                     "Address": address
+                 }
+             )
+             email = None
+             author_name = None
+             address = None
+         if author.find("strong"):
+             author_name = author.text
+         elif "Email:" in author.text:
+             email = author.text.split(":", 1)[1].strip()
+         elif "Affiliation:" in author.text:
+             address = author.text.split(":", 1)[1].strip()
+         if author_name == None:
+             continue
+     # Flush the last author collected by the loop, otherwise it would be dropped
+     if email != None and author_name != None and address != None:
+         output.append({"Name": author_name, "Email": email, "Address": address})
+     return output
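A small sketch of what `get_headers` expects and returns, using a made-up header block (the same helper is repeated in the other scraper modules):

```python
from amsscrapper import get_headers

raw = """
User-Agent: Mozilla/5.0
Accept: text/html
Upgrade-Insecure-Requests: 1
"""

headers = get_headers(raw)
print(headers)
# {'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html', 'Upgrade-Insecure-Requests': '1'}
```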
degruyterscrapper.py ADDED
@@ -0,0 +1,61 @@
+ import requests
+ from bs4 import BeautifulSoup
+
+ def get_headers(data: str) -> dict:
+     """Convert a copied header block (one "Key: value" pair per line) into a dict.
+
+     Args:
+         data (str): The headers as a string (in Firefox you can copy the request or response headers and paste them here)
+
+     Returns:
+         dict: The parsed headers
+     """
+     data = data.strip()
+     data = data.split("\n")
+     out = {}
+     for dt in data:
+         key = dt.split(":", 1)[0].strip()
+         value = dt.split(":", 1)[1].strip()
+
+         if value.lower() == "none":
+             value = None
+         elif value.lower() == "true":
+             value = True
+         elif value.lower() == "false":
+             value = False
+
+         out[key] = value
+     return out
+
+ def getLinks(url: str) -> list:
+     browser = requests.session()
+     # url = f"https://www.degruyter.com/journal/key/fca/{volume}/{issue}/html"
+     data = browser.get(url)
+     fullPage = BeautifulSoup(data.text, "lxml")
+     links = fullPage.find("div", {"id": "issue-subject-group-researchpaper"})
+     output = []
+     for link in links.findAll("div", {"class": "text-container"}):
+         link = link.find("a", {"class": "issueContentsArticleLink linkHoverDark d-inline-block"}).get("href")
+         output.append(f"https://www.degruyter.com{link}")
+     return output
+
+ def get_author_details(url: str) -> list:
+     browser = requests.session()
+     data = browser.get(url)
+     authors = BeautifulSoup(data.text, "lxml")
+     authors = authors.find("ul", {"class": "contributors list-unstyled mb-2"})
+     output = []
+     for author in authors.findAll("span", {"class": "contributor"}):
+         author_name = author.text.strip()
+         author_address = author.find("contributor-popdown").get("affiliations").strip()
+         email = author.find("contributor-popdown").get("email").strip()
+         if len(email.strip()) < 1:
+             continue
+         output.append(
+             {
+                 "Name": author_name,
+                 "Email": email,
+                 "Address": author_address
+             }
+         )
+     return output
docs.md ADDED
@@ -0,0 +1,13 @@
+ ## Steps to install
+
+ - Make sure that Python and pip are installed; you can run the `python` command to check whether Python is installed.
+ - Check that pip works by running `pip`. If it is not recognised, use `python -m pip install -r requirements.txt`, or `python3 -m pip install -r requirements.txt` if the `python` command is not available.
+ - If only `python3` works, make sure to edit the file `run.bat` and change `python` to `python3`.
+ - If the `python` command works fine, just double-click `install.bat`.
+ - Then run `pip install webdriver-manager`. If that command does not work, use `python -m pip install webdriver-manager`, or `python3` instead if `python` is not available.
+ - Now everything is up and running.
+
+ ## Steps to use
+
+ - To use the script, just double-click the `run.bat` file.
+ - Then open `http://127.0.0.1:7860` in your browser and you are good to go.
install.bat ADDED
@@ -0,0 +1,2 @@
+ python -m pip install -r requirements.txt
+ echo Installation finished successfully
notification.mp3 ADDED
Binary file (931 kB).
 
out.xlsx ADDED
Binary file (105 kB).
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ bs4
+ pygame
+ gradio
+ openpyxl
+ selenium
+ requests
+ webdriver-manager
run.bat ADDED
@@ -0,0 +1 @@
+ python server.py
sciencedirect.py ADDED
@@ -0,0 +1,210 @@
+ import os
+ from requests import session
+ from bs4 import BeautifulSoup
+ import base64
+ import urllib.parse
+ import traceback
+ import json
+ from sheets import ExcelAutomator
+
+ req = session()
+
+ def get_headers(data: str) -> dict:
+     data = data.strip()
+     data = data.split("\n")
+     out = {}
+     for dt in data:
+         key = dt.split(":", 1)[0].strip()
+         value = dt.split(":", 1)[1].strip()
+
+         if value.lower() == "none":
+             value = None
+         elif value.lower() == "true":
+             value = True
+         elif value.lower() == "false":
+             value = False
+
+         out[key] = value
+     return out
+
+ def get_email_from_encoding(encoded_str):
+     # The encoded string is base64 -> URL-encoded -> JSON
+     base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
+     url_decoded = urllib.parse.unquote(base64_decoded)
+     decoded_json = json.loads(url_decoded)
+     try:
+         if decoded_json["#name"] == 'e-address':
+             if decoded_json['$']['type'] == 'email':
+                 if 'href' in decoded_json['$']:
+                     if 'mailto:' in decoded_json['$']['href']:
+                         return decoded_json['$']['href'].replace("mailto:", "")
+                     else:
+                         return None
+                 else:
+                     return decoded_json['_']
+             else:
+                 return None
+         else:
+             return None
+     except Exception as e:
+         with open("jsondata.json", "w") as op:
+             json.dump(decoded_json, op)
+         print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
+         exit()
+
+ def run(url: str, last_artical_name: str=None) -> tuple:
+     """Fetch an issue page and collect the author details of every article in it.
+
+     Args:
+         url (str): The sciencedirect.com issue URL
+         last_artical_name (str, optional): Title of the previously scraped issue, used to detect when there are no more issues
+
+     Returns:
+         tuple : The author data and the page title
+     """
+     headers = """
+     User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
+     Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
+     Accept-Language: en-US,en;q=0.5
+     Accept-Encoding: gzip, deflate, br
+     Connection: keep-alive
+     Upgrade-Insecure-Requests: 1
+     Sec-Fetch-Dest: document
+     Sec-Fetch-Mode: navigate
+     Sec-Fetch-Site: none
+     Sec-Fetch-User: ?1
+     Sec-GPC: 1
+     """
+
+     headers = get_headers(headers)
+
+     # url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
+
+     data = req.get(url, headers=headers)
+
+     artical_links = []
+     fullpage = BeautifulSoup(str(data.text), "lxml")
+     if fullpage.title.string.strip() == last_artical_name:
+         return None, fullpage.title.string.strip()
+     for link in fullpage.findAll("a", {"class" : "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
+         artical_links.append("https://www.sciencedirect.com" + link.get("href"))
+     print(f"Total articles found : {len(artical_links)}")
+     n = 1
+     auth = []
+     print(f"Getting all articles from - {fullpage.title.string}")
+     for li in artical_links:
+         print(f"Fetching data of article {n}")
+         authors = stage_two(li)
+         auth.extend(authors)
+         n += 1
+     return auth, fullpage.title.string.strip()
+
+ def stage_two(url: str) -> list:
+
+     headers = """
+     User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
+     Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
+     Accept-Language: en-US,en;q=0.5
+     Accept-Encoding: gzip, deflate, br
+     Connection: keep-alive
+     Upgrade-Insecure-Requests: 1
+     Sec-Fetch-Dest: document
+     Sec-Fetch-Mode: navigate
+     Sec-Fetch-Site: none
+     Sec-Fetch-User: ?1
+     Sec-GPC: 1
+     """
+     headers = get_headers(headers)
+
+     data = req.get(url, headers=headers)
+     page = BeautifulSoup(data.text, "lxml")
+     json_data = page.find("script", {"type" : "application/json"})
+     json_data = json.loads(json_data.text.strip())
+     authors_detail = []
+     address = json_data['authors']['affiliations']
+     n = 1
+     if len(json_data['authors']['content']) < 1:
+         return authors_detail
+     if not '$$' in json_data['authors']['content'][0]:
+         with open("jsondata.json", "w") as op:
+             json.dump(json_data, op, indent=4)
+         print("ERROR Check jsondata file")
+         exit()
+     address = "Not Found"
+     addr = []
+     authr = []
+     email = None
+     for author in json_data['authors']['content'][0]['$$']:
+         if author['#name'] == 'author':
+             # It's an author entry
+             author_name = " "
+             for au in author['$$']:
+                 if au['#name'] == 'given-name' or au['#name'] == 'name':
+                     author_name = au['_'] + author_name
+                 if au['#name'] == 'surname':
+                     author_name = f"{author_name}{au['_']}"
+                 if au['#name'] == 'encoded-e-address':
+                     email = get_email_from_encoding(au['__encoded'])
+             if email:
+                 authr.append(
+                     {
+                         'Name' : author_name,
+                         'Email' : email
+                     }
+                 )
+             else:
+                 continue
+         if author['#name'] == 'affiliation':
+             for cor in author['$$']:
+                 if '_' in cor:
+                     if address == "Not Found":
+                         address = cor['_']
+                     else:
+                         address = f"{address} {cor['_']}"
+             addr.append(address)
+
+     output = []
+     for aut in authr:
+         try:
+             address = addr[authr.index(aut)]
+         except:
+             address = "Not Found"
+         if address == "Not Found":
+             address = url
+         output.append(
+             {
+                 'Name' : aut['Name'],
+                 'Email' : aut['Email'],
+                 'Address' : address
+             }
+         )
+     return output
+
+ def get_author_info_specific(vol: int, issue: int) -> list:
+     print(f"Getting detail of volume {vol} and issue {issue}")
+     # Build the issue URL from the template above, since run() takes a URL
+     url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{vol}/issue/{issue}"
+     data, page_title = run(url)
+     return data
+
+ def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
+     allAuthors = []
+     last_page_title = None
+     for i in range(from_vol, to_vol + 1):
+         print(f"Getting data of vol {i}")
+         d = 1
+         while True:
+             try:
+                 url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{i}/issue/{d}"
+                 data, page_title = run(url, last_page_title)
+                 if last_page_title == page_title:
+                     print(f"All issues covered of vol {i}, changing volume")
+                     print("--------------------------------------------------------------------------")
+                     break
+                 else:
+                     last_page_title = page_title
+                 allAuthors.extend(data)
+                 print(f"Issue {d} data received, total authors : {len(allAuthors)}")
+             except Exception as e:
+                 print(f"ERROR : {traceback.format_exc()}")
+                 print(f"All issues covered of vol {i}")
+                 print("--------------------------------------------------------------------------")
+                 break
+             d += 1
+     return allAuthors
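To see what `get_email_from_encoding` actually unpacks, here is a round-trip sketch that builds an encoded payload in the same base64 + URL-encoded JSON shape the function checks for; the payload and e-mail address are made up.

```python
import base64, json, urllib.parse
from sciencedirect import get_email_from_encoding

# Fabricated payload matching the structure the decoder expects
payload = {"#name": "e-address", "$": {"type": "email", "href": "mailto:jane.doe@example.org"}}
encoded = base64.b64encode(urllib.parse.quote(json.dumps(payload)).encode()).decode()

print(get_email_from_encoding(encoded))  # -> jane.doe@example.org
```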
sciencedirect_admaths.py ADDED
@@ -0,0 +1,207 @@
+ import os
+ from requests import session
+ from bs4 import BeautifulSoup
+ import base64
+ import urllib.parse
+ import traceback
+ import json
+ from sheets import ExcelAutomator
+
+ req = session()
+
+ def get_headers(data: str) -> dict:
+     data = data.strip()
+     data = data.split("\n")
+     out = {}
+     for dt in data:
+         key = dt.split(":", 1)[0].strip()
+         value = dt.split(":", 1)[1].strip()
+
+         if value.lower() == "none":
+             value = None
+         elif value.lower() == "true":
+             value = True
+         elif value.lower() == "false":
+             value = False
+
+         out[key] = value
+     return out
+
+ def get_email_from_encoding(encoded_str):
+     try:
+         # The encoded string is base64 -> URL-encoded -> JSON
+         base64_decoded = base64.b64decode(encoded_str).decode('utf-8')
+         url_decoded = urllib.parse.unquote(base64_decoded)
+         decoded_json = json.loads(url_decoded)
+         try:
+             if decoded_json["#name"] == 'e-address':
+                 if decoded_json['$']['type'] == 'email':
+                     if 'href' in decoded_json['$']:
+                         if 'mailto:' in decoded_json['$']['href']:
+                             return decoded_json['$']['href'].replace("mailto:", "")
+                         else:
+                             return None
+                     else:
+                         return decoded_json['_']
+                 else:
+                     return None
+             else:
+                 return None
+         except Exception as e:
+             with open("jsondata.json", "w") as op:
+                 json.dump(decoded_json, op)
+             print(f"ERROR : {e},\n---------------------------------------------------------\n{traceback.format_exc()}\n\n---------------------------------------------------------")
+             exit()
+     except:
+         return None
+
+ def run(url: str, last_artical_name: str=None) -> tuple:
+     """Fetch a volume page and collect the author details of every article in it.
+
+     Args:
+         url (str): The sciencedirect.com volume URL
+         last_artical_name (str, optional): Title of the previously scraped volume, used to detect when there are no more volumes
+
+     Returns:
+         tuple : The author data and the page title
+     """
+     headers = """
+     User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
+     Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
+     Accept-Language: en-US,en;q=0.5
+     Accept-Encoding: gzip, deflate, br
+     Connection: keep-alive
+     Upgrade-Insecure-Requests: 1
+     Sec-Fetch-Dest: document
+     Sec-Fetch-Mode: navigate
+     Sec-Fetch-Site: none
+     Sec-Fetch-User: ?1
+     Sec-GPC: 1
+     """
+
+     headers = get_headers(headers)
+
+     # url = f"https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/{volume}/suppl/C"
+
+     data = req.get(url, headers=headers)
+
+     artical_links = []
+     fullpage = BeautifulSoup(str(data.text), "lxml")
+     if fullpage.title.string.strip() == last_artical_name:
+         return None, fullpage.title.string.strip()
+     for link in fullpage.findAll("a", {"class" : "anchor article-content-title u-margin-xs-top u-margin-s-bottom anchor-default"}):
+         artical_links.append("https://www.sciencedirect.com" + link.get("href"))
+     print(f"Total articles found : {len(artical_links)}")
+     n = 1
+     auth = []
+     print(f"Getting all articles from - {fullpage.title.string}")
+     for li in artical_links:
+         print(f"Fetching data of article {n}")
+         authors = stage_two(li)
+         auth.extend(authors)
+         n += 1
+     return auth, fullpage.title.string.strip()
+
+ def stage_two(url: str) -> list:
+
+     headers = """
+     User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
+     Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
+     Accept-Language: en-US,en;q=0.5
+     Accept-Encoding: gzip, deflate, br
+     Connection: keep-alive
+     Upgrade-Insecure-Requests: 1
+     Sec-Fetch-Dest: document
+     Sec-Fetch-Mode: navigate
+     Sec-Fetch-Site: none
+     Sec-Fetch-User: ?1
+     Sec-GPC: 1
+     """
+     headers = get_headers(headers)
+
+     data = req.get(url, headers=headers)
+     page = BeautifulSoup(data.text, "lxml")
+     json_data = page.find("script", {"type" : "application/json"})
+     json_data = json.loads(json_data.text.strip())
+     authors_detail = []
+     address = json_data['authors']['affiliations']
+     n = 1
+     if len(json_data['authors']['content']) < 1:
+         return authors_detail
+     if not '$$' in json_data['authors']['content'][0]:
+         with open("jsondata.json", "w") as op:
+             json.dump(json_data, op, indent=4)
+         print("ERROR Check jsondata file")
+         exit()
+     address = "Not Found"
+     addr = []
+     authr = []
+     for author in json_data['authors']['content'][0]['$$']:
+         if author['#name'] == 'author':
+             # It's an author entry
+             author_name = " "
+             for au in author['$$']:
+                 if au['#name'] == 'given-name' or au['#name'] == 'name':
+                     author_name = au['_'] + author_name
+                 if au['#name'] == 'surname':
+                     author_name = f"{author_name}{au['_']}"
+                 if au['#name'] == 'encoded-e-address':
+                     email = get_email_from_encoding(au['__encoded'])
+                     if email:
+                         authr.append(
+                             {
+                                 'Name' : author_name,
+                                 'Email' : email
+                             }
+                         )
+         if author['#name'] == 'affiliation':
+             for cor in author['$$']:
+                 if '_' in cor:
+                     if address == "Not Found":
+                         address = cor['_']
+                     else:
+                         address = f"{address} {cor['_']}"
+             addr.append(address)
+
+     output = []
+     for aut in authr:
+         try:
+             address = addr[authr.index(aut)]
+         except:
+             address = "Not Found"
+         if address == "Not Found":
+             address = url
+         output.append(
+             {
+                 'Name' : aut['Name'],
+                 'Email' : aut['Email'],
+                 'Address' : address
+             }
+         )
+     return output
+
+ def get_author_info_specific(vol: int) -> list:
+     print(f"Getting detail of volume {vol}")
+     # Build the volume URL from the template above, since run() takes a URL
+     url = f"https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/{vol}/suppl/C"
+     data, page_title = run(url)
+     return data
+
+ def get_author_info_in_range(from_vol: int, to_vol: int) -> list:
+     allAuthors = []
+     last_page_title = None
+     for i in range(from_vol, to_vol + 1):
+         print(f"Getting data of vol {i}")
+         try:
+             url = f"https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/{i}/suppl/C"
+             data, page_title = run(url, last_page_title)
+             if last_page_title == page_title:
+                 print(f"All issues covered of vol {i}, changing volume")
+                 print("--------------------------------------------------------------------------")
+                 break
+             else:
+                 last_page_title = page_title
+             allAuthors.extend(data)
+             print(f"Data received, total authors : {len(allAuthors)}")
+         except Exception as e:
+             print(f"ERROR : {traceback.format_exc()}")
+             print(f"All issues covered of vol {i}")
+             print("--------------------------------------------------------------------------")
+             break
+     return allAuthors
seleliumdriver.py ADDED
@@ -0,0 +1,40 @@
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service as ChromeService
+ from selenium.webdriver.firefox.service import Service as FirefoxService
+ from webdriver_manager.chrome import ChromeDriverManager
+ from webdriver_manager.firefox import GeckoDriverManager
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
+ from selenium.webdriver.firefox.options import Options as FirefoxOptions
+
+ class WebScraper:
+
+     def __init__(self, browser='chrome', hidden=True):
+         if browser.lower() == 'chrome':
+             options = ChromeOptions()
+             if hidden:
+                 options.add_argument('--headless')
+                 options.add_argument('--window-size=1920,1200')
+             self.driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
+         elif browser.lower() == 'firefox':
+             options = FirefoxOptions()
+             if hidden:
+                 options.add_argument('--headless')
+                 options.add_argument('--window-size=1920,1200')
+             self.driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()), options=options)
+         else:
+             raise ValueError('Unsupported browser. Only "chrome" and "firefox" are supported.')
+
+     def get(self, url, wait_time=10):
+         self.driver.get(url)
+         WebDriverWait(self.driver, wait_time).until(
+             EC.presence_of_element_located((By.TAG_NAME, 'body'))
+         )
+
+     def get_html(self):
+         return self.driver.page_source
+
+     def close_browser(self):
+         self.driver.quit()
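A quick usage sketch of the wrapper; it assumes Firefox is installed and lets webdriver-manager download a matching geckodriver on first run.

```python
from seleliumdriver import WebScraper

scraper = WebScraper("firefox", hidden=True)   # headless Firefox
scraper.get("https://example.com")             # waits until <body> is present
html = scraper.get_html()
scraper.close_browser()
print(len(html))
```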
server.py ADDED
@@ -0,0 +1,219 @@
+ import gradio as gr
+ import aiimsscrapper
+ import amsscrapper
+ import degruyterscrapper
+ import sciencedirect
+ import sciencedirect_admaths
+ import springerscrapper
+ import wileyscrapper
+ from urllib.parse import urlparse
+ import traceback
+ from sheets import ExcelAutomator
+ import pygame
+ import threading
+ from datetime import datetime
+ import os
+
+ def play_sound():
+     pygame.mixer.init()
+     pygame.mixer.music.load("notification.mp3")  # Ensure this file exists
+     pygame.mixer.music.play()
+
+ def print(data: str):
+     # Shadows the built-in print: every message is appended to LOGS.txt and shown as a Gradio toast
+     if not os.path.exists("LOGS.txt"):
+         with open("LOGS.txt", "w") as op:
+             op.write(f"{datetime.now().hour}-{datetime.now().minute}-{datetime.now().second}.{datetime.now().microsecond}/{datetime.now().day}-{datetime.now().month}-{datetime.now().year}\n------------------------------------------------------------------\n")
+     with open("LOGS.txt", "a") as op:
+         op.write(f"\n{datetime.now().hour}-{datetime.now().minute}-{datetime.now().second}.{datetime.now().microsecond}/{datetime.now().day}-{datetime.now().month}-{datetime.now().year} -> {data}")
+     gr.Info(data, duration=3)
+
+ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str) -> str:
+     url_sch = urlparse(url)
+     domain = url_sch.hostname
+     sht = ExcelAutomator([
+         "Name",
+         "Email",
+         "Address"
+     ],
+     output
+     )
+     filen = True
+     if "{" in url:
+         links = []
+         if reverse:
+             for vol in reversed(range(from_range, to_range)):
+                 print(url)
+                 links.append(url.format(v=vol, i="{i}"))
+         else:
+             for vol in range(from_range, to_range):
+                 print(url)
+                 links.append(url.format(v=vol, i="{i}"))
+     else:
+         links = [url]
+         filen = False
+     print(f"Total links found {len(links)}")
+     try:
+         if domain == "www.ams.org" or domain == "ams.org":
+             # AMS scraper
+             for ur in links:
+                 isu = 1
+                 while True:
+                     # Zero-pad the issue number only for the URL; keep the counter itself an int
+                     isu_str = f"{isu:02d}"
+                     try:
+                         if filen:
+                             print(f"Getting data for link {ur.format(i=isu_str)}")
+                             allLinks = amsscrapper.getlinks(ur.format(i=isu_str))
+                             isu += 1
+                         else:
+                             print(f"Getting data for link {ur}")
+                             allLinks = amsscrapper.getlinks(ur)
+                     except:
+                         print("Error")
+                         break
+                     for link in allLinks:
+                         authors = amsscrapper.get_authors(link)
+                         for auth in authors:
+                             sht.save(auth)
+                     if filen == False:  # A single URL was given, so one pass is enough
+                         break
+                 sht.save_to_file()
+             return sht.save_to_file()
+         elif domain == "www.degruyter.com" or domain == "degruyter.com":
+             # De Gruyter scraper
+             for ur in links:
+                 isu = 1
+                 while True:
+                     try:
+                         if filen:
+                             print(f"Getting data for link {ur.format(i=isu)}")
+                             allLinks = degruyterscrapper.getLinks(ur.format(i=isu))
+                             isu += 1
+                         else:
+                             print(f"Getting data for link {ur}")
+                             allLinks = degruyterscrapper.getLinks(ur)
+                     except:
+                         break
+                     for link in allLinks:
+                         authors = degruyterscrapper.get_author_details(link)
+                         for auth in authors:
+                             sht.save(auth)
+                     if filen == False:  # A single URL was given, so one pass is enough
+                         break
+                 sht.save_to_file()
+             return sht.save_to_file()
+         elif domain == "www.aimspress.com" or domain == "aimspress.com":
+             for ur in links:
+                 isu = 1
+                 while True:
+                     try:
+                         if filen:
+                             print(f"Getting data for link {ur.format(i=isu)}")
+                             allLinks = aiimsscrapper.get_links(ur.format(i=isu))
+                             isu += 1
+                         else:
+                             print(f"Getting data for link {ur}")
+                             allLinks = aiimsscrapper.get_links(ur)
+                     except:
+                         break
+                     for link in allLinks:
+                         authors = aiimsscrapper.get_author_details(link)
+                         for auth in authors:
+                             sht.save(auth)
+                     if filen == False:  # A single URL was given, so one pass is enough
+                         break
+                 sht.save_to_file()
+             return sht.save_to_file()
+         elif domain == "link.springer.com":
+             # Springer scraping here
+             for ur in links:
+                 isu = 1
+                 while True:
+                     try:
+                         if filen:
+                             print(f"Getting data for link {ur.format(i=isu)}")
+                             allLinks = springerscrapper.get_all_articals_link(ur.format(i=isu))
+                             isu += 1
+                         else:
+                             print(f"Getting data for link {ur}")
+                             allLinks = springerscrapper.get_all_articals_link(ur)
+                     except:
+                         break
+                     for link in allLinks:
+                         authors = springerscrapper.get_authors(link)
+                         for auth in authors:
+                             sht.save(auth)
+                     if filen == False:  # A single URL was given, so one pass is enough
+                         break
+                 sht.save_to_file()
+             return sht.save_to_file()
+         elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" in url:
+             # Acta Mathematica Scientia; this more specific check must come before the generic sciencedirect one
+             for ur in links:
+                 isu = 1
+                 while True:
+                     try:
+                         if filen:
+                             print(f"Getting data for link {ur.format(i=isu)}")
+                             authors, _ = sciencedirect_admaths.run(ur.format(i=isu))
+                             isu += 1
+                         else:
+                             print(f"Getting data for link {ur}")
+                             authors, _ = sciencedirect_admaths.run(ur)
+                     except:
+                         break
+                     # run() already walks every article of the page and returns the author details
+                     for auth in authors:
+                         sht.save(auth)
+                     if filen == False:  # A single URL was given, so one pass is enough
+                         break
+                 sht.save_to_file()
+             return sht.save_to_file()
+         elif domain == "www.sciencedirect.com":
+             # Generic ScienceDirect scraping here
+             for ur in links:
+                 isu = 1
+                 while True:
+                     try:
+                         if filen:
+                             print(f"Getting data for link {ur.format(i=isu)}")
+                             authors, _ = sciencedirect.run(ur.format(i=isu))
+                             isu += 1
+                         else:
+                             print(f"Getting data for link {ur}")
+                             authors, _ = sciencedirect.run(ur)
+                     except:
+                         break
+                     # run() already walks every article of the page and returns the author details
+                     for auth in authors:
+                         sht.save(auth)
+                     if filen == False:  # A single URL was given, so one pass is enough
+                         break
+                 sht.save_to_file()
+             return sht.save_to_file()
+         else:
+             raise gr.Error("Invalid url found contact : @H4CK3R_5M4CK3R on telegram")
+     except gr.Error:
+         pass
+     except:
+         with open("ERROR-LOGS.txt", "w") as op:
+             op.write(f"Error {url} : {traceback.format_exc()}")
+         raise gr.Error("Something went wrong, check your URL or contact @h4ck3r_5m4ck3r on telegram")
+
+ def handle_url(url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
+     output = filterUrlandRun(url, From_volume, To_Volume, Reverse, Output)
+     threading.Thread(target=play_sound).start()
+     return output
+
+ interface = gr.Interface(
+     fn=handle_url,
+     inputs=["textbox", "number", "number", "textbox", "checkbox"],
+     outputs="file",
+     title="Web Scraper",
+     description="Enter a URL and download a generated XLSX file."
+ )
+
+ interface.launch()
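docs.md sends users to `http://127.0.0.1:7860`, which is Gradio's default address and port; if you would rather not rely on the defaults, the launch call could be pinned explicitly (a suggestion, not part of the commit):

```python
# Optional: pin the address and port so the URL in docs.md always matches
interface.launch(server_name="127.0.0.1", server_port=7860)
```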
sheets.py ADDED
@@ -0,0 +1,32 @@
+ import openpyxl
+ import os
+
+ class ExcelAutomator:
+
+     def __init__(self, name: list, output: str):
+         self.columns = name
+         self.output = output if output.endswith(".xlsx") else f"{output}.xlsx"
+
+         if os.path.exists(self.output):
+             self.workbook = openpyxl.load_workbook(self.output)
+             self.sheet = self.workbook.active
+         else:
+             self.workbook = openpyxl.Workbook()
+             self.sheet = self.workbook.active
+             for col_num, column_name in enumerate(self.columns, 1):
+                 self.sheet.cell(row=1, column=col_num, value=column_name)
+
+     def save(self, data_dict):
+         """
+         Save a new row of data to the Excel file.
+         :param data_dict: Dictionary with keys as column names and values as the data to save.
+         """
+         row_data = [data_dict.get(column, None) for column in self.columns]
+         self.sheet.append(row_data)
+
+     def save_to_file(self):
+         """
+         Save the workbook to a file.
+         """
+         self.workbook.save(self.output)
+         return self.output
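A short usage sketch of `ExcelAutomator` with made-up data: rows are buffered in the in-memory workbook by `save()` and only written to disk by `save_to_file()`.

```python
from sheets import ExcelAutomator

sheet = ExcelAutomator(["Name", "Email", "Address"], "out.xlsx")
sheet.save({"Name": "Jane Doe", "Email": "jane@example.org", "Address": "Example University"})
path = sheet.save_to_file()
print(f"Workbook written to {path}")
```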
springerscrapper.py ADDED
@@ -0,0 +1,103 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import json
+ from sheets import ExcelAutomator
+
+ def get_headers(data: str) -> dict:
+     """Convert a copied header block (one "Key: value" pair per line) into a dict.
+
+     Args:
+         data (str): The headers as a string (in Firefox you can copy the request or response headers and paste them here)
+
+     Returns:
+         dict: The parsed headers
+     """
+     data = data.strip()
+     data = data.split("\n")
+     out = {}
+     for dt in data:
+         key = dt.split(":", 1)[0].strip()
+         value = dt.split(":", 1)[1].strip()
+
+         if value.lower() == "none":
+             value = None
+         elif value.lower() == "true":
+             value = True
+         elif value.lower() == "false":
+             value = False
+
+         out[key] = value
+     return out
+
+ def get_all_articals_link(url: str) -> list:
+     browser = requests.session()
+     # url = f"https://link.springer.com/journal/208/volumes-and-issues/{volume}-{issue}"
+     headers = """
+     User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
+     Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
+     Accept-Language: en-US,en;q=0.5
+     Accept-Encoding: gzip, deflate, br
+     Referer: https://link.springer.com/journal/208/volumes-and-issues
+     Alt-Used: link.springer.com
+     Connection: keep-alive
+     Upgrade-Insecure-Requests: 1
+     Sec-Fetch-Dest: document
+     Sec-Fetch-Mode: navigate
+     Sec-Fetch-Site: same-origin
+     Sec-Fetch-User: ?1
+     Sec-GPC: 1
+     TE: trailers
+     """
+
+     head = get_headers(headers)
+
+     data = browser.get(url, headers=head)
+
+     fullpage = BeautifulSoup(data.text, "lxml")
+
+     orderlist = fullpage.find("ol", {"class": "u-list-reset"})
+     allLinks = []
+     for dt in orderlist.findAll("li"):
+         if not dt.find("a"):
+             continue
+         allLinks.append(dt.find("a").get("href"))
+     return allLinks
+
+ def get_authors(url: str) -> list:
+     browser = requests.session()
+     headers = """
+     User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
+     Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
+     Accept-Language: en-US,en;q=0.5
+     Accept-Encoding: gzip, deflate, br
+     Referer: https://link.springer.com/journal/208/volumes-and-issues
+     Alt-Used: link.springer.com
+     Connection: keep-alive
+     Upgrade-Insecure-Requests: 1
+     Sec-Fetch-Dest: document
+     Sec-Fetch-Mode: navigate
+     Sec-Fetch-Site: same-origin
+     Sec-Fetch-User: ?1
+     Sec-GPC: 1
+     TE: trailers
+     """
+
+     head = get_headers(headers)
+     data = browser.get(url, headers=head)
+
+     main_page = BeautifulSoup(data.text, "lxml")
+
+     json_data = main_page.find("script", {"type": "application/ld+json"}).text
+     json_data = json.loads(json_data)
+     authors = json_data['mainEntity']['author']
+     output = []
+     for author in authors:
+         if 'email' in author:
+             output.append(
+                 {
+                     "Name": author['name'],
+                     'Email': author['email'],
+                     'Address': ", ".join(item['address']['name'] for item in author['affiliation'] if 'address' in item and 'name' in item['address'])
+                 }
+             )
+     return output
wileyscrapper.py ADDED
@@ -0,0 +1,92 @@
+ import requests
+ from bs4 import BeautifulSoup
+ from sheets import ExcelAutomator
+ from seleliumdriver import WebScraper
+
+ browser = requests.session()
+
+ def save(data: str):
+     with open("data.html", "w") as op:
+         op.write(str(data))
+
+ def get_headers(data: str) -> dict:
+     """Convert a copied header block (one "Key: value" pair per line) into a dict.
+
+     Args:
+         data (str): The headers as a string (in Firefox you can copy the request or response headers and paste them here)
+
+     Returns:
+         dict: The parsed headers
+     """
+     data = data.strip()
+     data = data.split("\n")
+     out = {}
+     for dt in data:
+         key = dt.split(":", 1)[0].strip()
+         value = dt.split(":", 1)[1].strip()
+
+         if value.lower() == "none":
+             value = None
+         elif value.lower() == "true":
+             value = True
+         elif value.lower() == "false":
+             value = False
+
+         out[key] = value
+     return out
+
+ def get_links(url: str, issue: int) -> list:
+     headers = """
+     User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
+     Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
+     Accept-Language: en-US,en;q=0.5
+     Accept-Encoding: gzip, deflate, br
+     Alt-Used: onlinelibrary.wiley.com
+     Connection: keep-alive
+     Cookie: MAID=5SmfjugKsbanANmqY7QFTQ==; MACHINE_LAST_SEEN=2024-07-13T00%3A17%3A54.434-07%3A00; osano_consentmanager_uuid=a09cf48c-a316-44da-a630-b284fe618561; osano_consentmanager=wtFVO73sxrqPK1QjgWvz2PRznZ_IuLc6ARcv2t0_pFtepafXHZgrg-S478uJo9AvbIWsu3sbpgmvaCKL_zNkJQZzpvdHNzGX6NQQ6cwL_c09p-7H9gmYq7lFeOBlGJxYVbwgVIa5TZDqtpLjvla4iYf-rEyPZ0zXi8nZVVY5aCRrKBkWiIYkwIWvpeVeBLepXirD0RkYCGg-O2PWE000CQi4kWVXGTOkNMFqFOSQ-tthQqpC7pvT9AeCAodC2z6CeM6tTjz3TNmp8sTxikwwT4jzZ9HRy76gqQjb8g==; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_identity=CiY4MDg5NTE5MTAxMTg2NDkzMzQzMTI2OTY5MjMzMTU3OTYwODc1N1ITCM6izY3mMRABGAEqBElORDEwAPAB5cnS14oy; Hm_lvt_953dddc9c7bea72022e3bd3ba7782e7b=1720765103,1720855080; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=MCMID|80895191011864933431269692331579608757; JSESSIONID=90BFBDCF8874DBB2B708D37ACC4172DD; __cf_bm=FgCtBcokrG75eoj6.nqj2jTcbcl.vtSPGArq4iAYwYk-1720855074-1.0.1.1-OCKWcrDvKtyaaNLld1aBjaFFwZLoLHauSzJ0NEZFn1JLYK4G4lqmaTMEE50PAzZCReTc13aRgLNyLlqu6JOllleWjBRMQr5vc3YjxJ4kdPs; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_cluster=ind1; cf_clearance=B0r0CEgCWVP2M5CKvRhRTvIW8MyIJM2WBVS14NsHxxE-1720855079-1.0.1.1-CqrZHd19zoe3QCemtBtqxsHiVLXILmnPkb9RjSG2yHndhy.XZzt14jGpjymiEPzjA0nFP7xw1hU6xsXIz6UDSg; Hm_lpvt_953dddc9c7bea72022e3bd3ba7782e7b=1720855160; HMACCOUNT=C851A9F6625CC221; randomizeUser=0.5543043437474287
+     Upgrade-Insecure-Requests: 1
+     Sec-Fetch-Dest: document
+     Sec-Fetch-Mode: navigate
+     Sec-Fetch-Site: none
+     Sec-Fetch-User: ?1
+     Sec-GPC: 1
+     TE: trailers
+     """
+     # url = f"https://onlinelibrary.wiley.com/toc/14679590/{year}/{volume}/{issue}"
+     data = browser.get(url, headers=get_headers(headers))
+     fullPage = BeautifulSoup(data.text, "lxml")
+     issuelinks = []
+     for link in fullPage.findAll("a", {"class": "issue-item__title visitable"}):
+         issuelinks.append(f'https://onlinelibrary.wiley.com{link.get("href")}')
+     return issuelinks
+
+ def decode_email(encoded_str):
+     # Cloudflare email obfuscation: the first two hex digits are the XOR key, the rest is the hex-encoded email
+     key = int(encoded_str[:2], 16)
+     encoded_bytes = bytes.fromhex(encoded_str[2:])
+     decoded_email = ''.join(chr(byte ^ key) for byte in encoded_bytes)
+     return decoded_email
+
+ def get_details(url: str):
+     driver = WebScraper(browser="firefox", hidden=False)
+     driver.get(url)
+     data = driver.get_html()
+     # save(data.text)
+     full_page = BeautifulSoup(data, "lxml")
+     author_detail = full_page.find("div", {"class": "accordion-tabbed"})
+     output = []
+     save(full_page)
+     for author in author_detail.findAll("span", {"class": "accordion-tabbed__tab-mobile accordion__closed"}):
+         author_name = author.find("p", {"class": "author-name"}).text.strip()
+         if author.find("span", {"class": "__cf_email__"}) == None:
+             continue
+         email = decode_email(author.find("span", {"class": "__cf_email__"}).get("data-cfemail"))
+         address = author.find("p", {"class": "author-name"}).findNext("p").text.strip()
+         output.append(
+             {
+                 "Name": author_name,
+                 "Email": email,
+                 'Address': address
+             }
+         )
+
+     return output
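Since `decode_email` undoes Cloudflare's XOR-based email obfuscation, a round-trip sketch makes the scheme concrete; `encode_email` is a throwaway helper written only for this demo, and the address is made up.

```python
from wileyscrapper import decode_email

def encode_email(email: str, key: int = 0x42) -> str:
    # First two hex digits are the XOR key, the rest is the key-XORed email in hex
    return f"{key:02x}" + "".join(f"{ord(c) ^ key:02x}" for c in email)

token = encode_email("jane.doe@example.org")
print(decode_email(token))  # -> jane.doe@example.org
```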