Commit 7bed2d8 · H4CK3R-5M4CK3R committed · Parent: 0b55168

added multiple urls scrapper

Changed files:
- app.py +61 -27
- sciencedirect.py +2 -2
- sciencedirect_admaths.py +1 -1
app.py
CHANGED
@@ -12,7 +12,6 @@ import os
 import random
 import string
 import json
-import base64
 import crypto
 
 auth = [
@@ -33,6 +32,8 @@ For bug reports or improvements, contact [@H4CK3R_5M4CK3R](https://t.me/H4CK3R_5
 2. **Multiple Issues Scraping:**
    - Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
    - Define the range for volumes, not issues. Ensure you pass the volume range correctly.
+   - To pass the range, you must use `()` at the start of the URL, like `(from:to)-link/{v}/{i}`; check out the examples below.
+   - You can now also pass multiple links and add a range to each one in the same way as above.
 
 3. **Read this before using google sheet feature**
    - **IMPORTANT** First make a google drive folder and then gave access to `[email protected]` and `[email protected]` super important.
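The new range prefix is handled with plain string slicing rather than a regex. Below is a minimal sketch of that parsing as a standalone helper; `parse_range_prefix` is an illustrative name, since in app.py this logic lives inline in `handle_url` (shown further down in this diff):

```python
# Illustrative helper; app.py performs this inline in handle_url.
def parse_range_prefix(url: str) -> tuple[int, int, str]:
    if url.startswith("("):
        # "(22:23)-https://..." -> from=22, to=23, url="https://..."
        from_volume = int(url.split("(", 1)[1].split(":")[0])
        to_volume = int(url.split("(", 1)[1].split(":")[1].split(")", 1)[0])
        url = url.split("-", 1)[1].strip()
        return from_volume, to_volume, url
    return 0, 0, url  # no prefix: defaults used by handle_url

assert parse_range_prefix(
    "(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html"
) == (22, 23, "https://www.degruyter.com/journal/key/fca/{v}/{i}/html")
```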
@@ -55,16 +56,17 @@ If you are copying this space make sure to contact the owner as mention above
 """
 
 exmpl = [
-    ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues",
-    ["https://www.degruyter.com/journal/key/fca/20/2/html",
-    ["https://www.degruyter.com/journal/key/fca/{v}/{i}/html",
-    ["https://www.aimspress.com/math/article/2024/8/archive-articles",
-    ["https://www.aimspress.com/math/article/{v}/{i}/archive-articles",
-    ["https://link.springer.com/journal/208/volumes-and-issues/388-3",
-    ["https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}",
-    ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C",
-    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6",
-    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}",
+    ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", "[email protected]", "asdfasdfasdfasdfasdf", "example1", True, True, False],
+    ["https://www.degruyter.com/journal/key/fca/20/2/html", "[email protected]", "asdfasdfasdfasdfasdf", "example2", True, True, False],
+    ["(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "[email protected]", "asdfasdfasdfasdfasdf", "example3", True, True, False],
+    ["https://www.aimspress.com/math/article/2024/8/archive-articles", "[email protected]", "asdfasdfasdfasdfasdf", "example4", True, True, False],
+    ["(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles", "[email protected]", "asdfasdfasdfasdfasdf", "example5", True, True, False],
+    ["https://link.springer.com/journal/208/volumes-and-issues/388-3", "[email protected]", "asdfasdfasdfasdfasdf", "example6", True, True, False],
+    ["(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", "[email protected]", "asdfasdfasdfasdfasdf", "example7", True, True, False],
+    ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", "[email protected]", "asdfasdfasdfasdfasdf", "example8", True, True, False],
+    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", "[email protected]", "asdfasdfasdfasdfasdf", "example9", True, True, False],
+    ["(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", "[email protected]", "asdfasdfasdfasdfasdf", "example10", True, True, False],
+    ["(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}\n(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}\n(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles\n(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html", "[email protected]", "asdfasdfasdfasdfasdf", "example11", True, True, False]
 ]
 
 stop_work = False
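For reference, example11 above is exactly what the new multi-URL mode expects as raw input: one range-prefixed template per line in the URL field.

```
(37:38)-https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}
(388:389)-https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}
(2021:2022)-https://www.aimspress.com/math/article/{v}/{i}/archive-articles
(22:23)-https://www.degruyter.com/journal/key/fca/{v}/{i}/html
```

`handle_url` splits this on `"\n"` and runs each template through the same volume/issue loop.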
@@ -73,7 +75,7 @@ def generate_random_filename(length=8):
     random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
     return random_string
 
-def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool):
+def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, output: str, owner: bool, mail: str, folder_id: str, credit: dict, usegooglesheet: bool, multiUrls: bool):
     if len(output.strip()) < 1:
         output = generate_random_filename()
     if os.path.exists(f"{output}.xlsx"):
@@ -84,8 +86,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
     shet = GoogleSheetAutomator(
         [
             "Name",
-            "
-            "
+            "Address",
+            "Email"
         ],
         folder_id,
         outputfile=output,
@@ -93,8 +95,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
     )
     sht = ExcelAutomator([
         "Name",
-        "
-        "
+        "Address",
+        "Email"
     ], output)
     filen = True
     if "{" in url:
@@ -266,6 +268,7 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
         yield {"final_output": sht.save_to_file(), "link" : ""}
 
     elif domain == "www.sciencedirect.com":
+        oldtitle = ""
         for ur in links:
             isu = 1
             while True:
@@ -274,11 +277,19 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
                 try:
                     if filen:
                         current_url = ur.format(i=isu)
-                        authors,
+                        authors, title = sciencedirect.run(current_url)
+                        if title == oldtitle:
+                            break
+                        else:
+                            oldtitle = title
                         isu += 1
                     else:
                         current_url = ur
-                        authors,
+                        authors, title = sciencedirect.run(current_url)
+                        if title == oldtitle:
+                            break
+                        else:
+                            oldtitle = title
                 except Exception as e:
                     break
                 yield {"current_url": current_url, "status": "fetching"}
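The duplicate-title check is the interesting part of this hunk: when the issue counter runs past the last published issue, ScienceDirect keeps serving a page rather than failing, so the loop stops once two consecutive fetches return the same title. A distilled sketch of the pattern, assuming `{v}` was already substituted into the template and that `sciencedirect.run` returns `(authors, title)` as this diff shows:

```python
import sciencedirect  # project module; run(url) -> (authors, title) per this commit

def scrape_all_issues(url_template: str):
    """Walk issue numbers upward until the journal page repeats."""
    oldtitle = ""
    isu = 1
    while True:
        current_url = url_template.format(i=isu)
        authors, title = sciencedirect.run(current_url)
        if title == oldtitle:
            break  # same page served again: issue number ran off the end
        oldtitle = title
        yield current_url, authors
        isu += 1
```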
@@ -303,6 +314,7 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
         yield {"final_output": sht.save_to_file(), "link" : ""}
 
     elif domain == "www.sciencedirect.com" and "acta-mathematica-scientia" in url:
+        oldtitle = ""
         for ur in links:
             isu = 1
             while True:
@@ -311,11 +323,19 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
                 try:
                     if filen:
                         current_url = ur.format(i=isu)
-                        authors,
+                        authors, title = sciencedirect_admaths.run(current_url)
+                        if title == oldtitle:
+                            break
+                        else:
+                            oldtitle = title
                         isu += 1
                     else:
                         current_url = ur
-                        authors,
+                        authors, title = sciencedirect_admaths.run(current_url)
+                        if title == oldtitle:
+                            break
+                        else:
+                            oldtitle = title
                 except Exception as e:
                     break
                 yield {"current_url": current_url, "status": "fetching"}
@@ -354,25 +374,35 @@ def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner:bool
     authors = []
     details = []
     final_output = None
+    multi_output = None
     link = None
     From_volume = 0
     To_Volume = 0
     urls = []
     if "\n" in Link:
         urls = Link.split("\n")
+        multiUrls = True
     else:
         urls.append(Link)
+        multiUrls = False
+    print(f"URLS : {urls}")
     for url in urls:
         if url.startswith("("):
-            From_volume = url.split("(", 1)[1].split(":")[0]
-            To_Volume = url.split("(", 1)[1].split(":")[1].split(")", 1)[0]
+            From_volume = int(url.split("(", 1)[1].split(":")[0])
+            To_Volume = int(url.split("(", 1)[1].split(":")[1].split(")", 1)[0])
             url = url.split("-", 1)[1].strip()
-
-
+        try:
+            credit = crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS"))
+            credit = json.loads(credit)
+        except:
+            pass
 
-        for _, result in enumerate(filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet)):
+        for _, result in enumerate(filterUrlandRun(url, From_volume, To_Volume, Reverse, Output, MakeOwner, Gmail, FolderId, credit=credit, usegooglesheet=UseGoogleSheet, multiUrls=multiUrls)):
             if "final_output" in result:
-
+                if multiUrls == False:
+                    final_output = result["final_output"]
+                else:
+                    multi_output = result["final_output"]
                 link = result["link"]
             else:
                 if "author" in result:
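With this commit, the Google credentials are decrypted once per URL, inside the loop. A minimal sketch of that path, assuming `crypto.decrypt(key, ciphertext)` returns the service-account JSON as a string (the signature is taken from the call above) and that `KEY` and `GOOGLE_AUTH_CREDENTIALS` are the Space's secrets:

```python
import json
import os

import crypto  # project-local module; decrypt(key, ciphertext) -> str

def load_google_credentials() -> dict | None:
    """Illustrative wrapper around the inline try/except in handle_url."""
    try:
        raw = crypto.decrypt(os.getenv("KEY"), os.getenv("GOOGLE_AUTH_CREDENTIALS"))
        return json.loads(raw)
    except Exception:
        return None  # app.py silently falls back (bare except: pass)
```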
@@ -383,11 +413,15 @@ def handle_url(Link: str, Gmail: str, FolderId: str, Output: str, MakeOwner:bool
                 details.append(f"Scraping: {current_url}\n")
         authors = authors[-3:] if len(authors) > 3 else authors
         details = details[-3:] if len(details) > 3 else details
+        if multiUrls:
+            final_output = None
         yield "\n".join(authors), "\n".join(details), final_output if final_output else None, gr.Audio("notification.mp3", autoplay=True) if final_output else None, link
+    if multiUrls == True:
+        yield "\n".join(authors), "\n".join(details), multi_output, gr.Audio("notification.mp3", autoplay=True), link
 
 interface = gr.Interface(
     fn=handle_url,
-    inputs=["
+    inputs=[gr.TextArea(label="Url / Url's", placeholder="Enter the url or multiple urls to scrap"), gr.Textbox(label="Access Gmail (Check Docs)"), gr.Textbox(label="Google Folder ID (Check Docs)"), gr.Textbox(label="Output File Name"), gr.Checkbox(True, label="Make Owner"), gr.Checkbox(True, label="Use Google Sheets"), "checkbox"],
     outputs=[gr.Markdown("LOGS", height="250px", elem_id="authorscroll"), gr.Markdown("", height="100px", elem_id="authorscroll"), "file", "audio", "textbox"],
     title="Web Scraper",
     description=description,
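The interface wiring depends on Gradio's generator support: because `handle_url` yields, each yielded 5-tuple re-renders the five outputs, which is how the logs stream while a job runs. A self-contained sketch of that pattern with illustrative names (not the app's actual inputs):

```python
import gradio as gr

def stream_lines(text: str):
    # Yield one update per input line; Gradio re-renders the outputs on each yield.
    lines = [l for l in text.splitlines() if l.strip()] or ["(empty)"]
    for n, line in enumerate(lines, start=1):
        yield f"processing: {line}", f"{n}/{len(lines)} done"

demo = gr.Interface(
    fn=stream_lines,
    inputs=gr.TextArea(label="Url / Url's"),
    outputs=[gr.Markdown(), gr.Markdown()],
)

if __name__ == "__main__":
    demo.launch()
```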
sciencedirect.py
CHANGED
@@ -80,7 +80,7 @@ Sec-GPC: 1
     # url = f"https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{volume}/issue/{issue}"
 
     data = req.get(url, headers=headers)
-
+    print(f"URL : {data.url}")
     artical_links = []
     fullpage = BeautifulSoup(str(data.text), "lxml")
     if fullpage.title.string.strip() == last_artical_name:
@@ -169,7 +169,7 @@ Sec-GPC: 1
     except:
         address = "Not Found"
     if address == "Not Found":
-        address =
+        address = ""
     output.append(
         {
             'Name' : aut['Name'],
sciencedirect_admaths.py
CHANGED
@@ -169,7 +169,7 @@ Sec-GPC: 1
     except:
         address = "Not Found"
     if address == "Not Found":
-        address =
+        address = ""
     output.append(
         {
             'Name' : aut['Name'],