Spaces:
Running
Running
H4CK3R-5M4CK3R
committed on
Commit
·
c83cceb
1
Parent(s):
ccab18e
added docs
Browse files
- app.py +60 -3
- wileyscrapper.py +5 -22
app.py
CHANGED
@@ -17,6 +17,38 @@ from builtins import print as original_print
|
|
17 |
import random
|
18 |
import string
|
19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
def generate_random_filename(length=8):
|
21 |
random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
|
22 |
return random_string
|
@@ -208,6 +240,30 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
|
|
208 |
break
|
209 |
sht.save_to_file()
|
210 |
return sht.save_to_file()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
else:
|
212 |
raise gr.Error("Invalid url found contact : @H4CK3R_5M4CK3R on telegram")
|
213 |
except gr.Error:
|
@@ -216,8 +272,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
|
|
216 |
print(f"ERROR : {traceback.format_exc()}", show=False)
|
217 |
raise gr.Error("Something error has occur check your url or contact @h4ck3r_5m4ck3r on telegram")
|
218 |
|
219 |
-
def handle_url(
|
220 |
-
output = filterUrlandRun(
|
221 |
# threading.Thread(target=play_sound).start()
|
222 |
return output, gr.Audio("notification.mp3", autoplay=True)
|
223 |
|
@@ -226,7 +282,8 @@ interface = gr.Interface(
|
|
226 |
inputs=["textbox", "number", "number", "textbox","checkbox"],
|
227 |
outputs=["file", "audio"],
|
228 |
title="Web Scrapper",
|
229 |
-
description=
|
|
|
230 |
)
|
231 |
|
232 |
interface.launch()
|
|
|
17 |
import random
|
18 |
import string
|
19 |
|
20 |
+
description = """
|
21 |
+
For bug reports or improvements, contact [@H4CK3R_5M4CK3R](https://t.me/H4CK3R_5M4CK3R) on Telegram.
|
22 |
+
|
23 |
+
**Usage Instructions:**
|
24 |
+
|
25 |
+
1. **Single Issue Scraping:**
|
26 |
+
- Provide the issue link in the URL section.
|
27 |
+
- Optionally, specify the desired output file name.
|
28 |
+
|
29 |
+
2. **Multiple Issues Scraping:**
|
30 |
+
- Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
|
31 |
+
- Define the range for volumes, not issues. Ensure you pass the volume range correctly.
|
32 |
+
|
33 |
+
**Note:**
|
34 |
+
- The range should be the volume range, not the issue range.
|
35 |
+
- Some authors may not have a listed address; their profile link will be included in the address section instead.
|
36 |
+
- After progress is completed make sure to click on clear because sometimes notification does't ring
|
37 |
+
"""
|
38 |
+
|
39 |
+
exmpl = [
|
40 |
+
["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", 0, 0, "example1", False],
|
41 |
+
["https://www.degruyter.com/journal/key/fca/20/2/html", 0, 0, "example2", False],
|
42 |
+
["https://www.degruyter.com/journal/key/fca/{v}/{i}/html", 20, 23, "example3", False],
|
43 |
+
["https://www.aimspress.com/math/article/2024/8/archive-articles", 0, 0, "example4", False],
|
44 |
+
["https://www.aimspress.com/math/article/{v}/{i}/archive-articles", 2021, 2024, "example5", False],
|
45 |
+
["https://link.springer.com/journal/208/volumes-and-issues/388-3", 0, 0, "example6", False],
|
46 |
+
["https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", 388, 389, "example6", False],
|
47 |
+
["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", 0, 0, "example7", False],
|
48 |
+
["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", 0, 0, "example7", False],
|
49 |
+
["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", 36, 38, "example7", False]
|
50 |
+
]
|
51 |
+
|
52 |
def generate_random_filename(length=8):
|
53 |
random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
|
54 |
return random_string
|
|
|
240 |
break
|
241 |
sht.save_to_file()
|
242 |
return sht.save_to_file()
|
243 |
+
|
244 |
+
elif domain == "onlinelibrary.wiley.com":
|
245 |
+
# acta mathematic scientia data here
|
246 |
+
for ur in links:
|
247 |
+
isu = 1
|
248 |
+
while True:
|
249 |
+
try:
|
250 |
+
if filen:
|
251 |
+
print(f"Getting data for link {ur.format(i=isu)}")
|
252 |
+
authors, _ = wileyscrapper.run(ur.format(i=isu))
|
253 |
+
isu += 1
|
254 |
+
else:
|
255 |
+
print(f"Getting data for link {ur}")
|
256 |
+
authors, _ = sciencedirect_admaths.run(ur)
|
257 |
+
except Exception as e:
|
258 |
+
print(f"Error : {traceback.format_exc()}", show=False)
|
259 |
+
break
|
260 |
+
for auth in authors:
|
261 |
+
sht.save(auth)
|
262 |
+
if filen == False: # If filen is true then dont need to start the loop
|
263 |
+
break
|
264 |
+
sht.save_to_file()
|
265 |
+
return sht.save_to_file()
|
266 |
+
|
267 |
else:
|
268 |
raise gr.Error("Invalid url found contact : @H4CK3R_5M4CK3R on telegram")
|
269 |
except gr.Error:
|
|
|
272 |
print(f"ERROR : {traceback.format_exc()}", show=False)
|
273 |
raise gr.Error("Something error has occur check your url or contact @h4ck3r_5m4ck3r on telegram")
|
274 |
|
275 |
+
def handle_url(Url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
|
276 |
+
output = filterUrlandRun(Url, From_volume, To_Volume, Reverse, Output)
|
277 |
# threading.Thread(target=play_sound).start()
|
278 |
return output, gr.Audio("notification.mp3", autoplay=True)
|
279 |
|
|
|
282 |
inputs=["textbox", "number", "number", "textbox","checkbox"],
|
283 |
outputs=["file", "audio"],
|
284 |
title="Web Scrapper",
|
285 |
+
description=description,
|
286 |
+
examples=exmpl
|
287 |
)
|
288 |
|
289 |
interface.launch()
|
wileyscrapper.py
CHANGED
@@ -3,8 +3,6 @@ from bs4 import BeautifulSoup
|
|
3 |
from sheets import ExcelAutomator
|
4 |
from seleliumdriver import WebScraper
|
5 |
|
6 |
-
browser = requests.session()
|
7 |
-
|
8 |
def save(data:str):
|
9 |
with open("data.html", "w") as op:
|
10 |
op.write(str(data))
|
@@ -35,26 +33,11 @@ def get_headers(data: str) -> dict:
|
|
35 |
out[key] = value
|
36 |
return out
|
37 |
|
38 |
-
def get_links(url: str
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
Accept-Encoding: gzip, deflate, br
|
44 |
-
Alt-Used: onlinelibrary.wiley.com
|
45 |
-
Connection: keep-alive
|
46 |
-
Cookie: MAID=5SmfjugKsbanANmqY7QFTQ==; MACHINE_LAST_SEEN=2024-07-13T00%3A17%3A54.434-07%3A00; osano_consentmanager_uuid=a09cf48c-a316-44da-a630-b284fe618561; osano_consentmanager=wtFVO73sxrqPK1QjgWvz2PRznZ_IuLc6ARcv2t0_pFtepafXHZgrg-S478uJo9AvbIWsu3sbpgmvaCKL_zNkJQZzpvdHNzGX6NQQ6cwL_c09p-7H9gmYq7lFeOBlGJxYVbwgVIa5TZDqtpLjvla4iYf-rEyPZ0zXi8nZVVY5aCRrKBkWiIYkwIWvpeVeBLepXirD0RkYCGg-O2PWE000CQi4kWVXGTOkNMFqFOSQ-tthQqpC7pvT9AeCAodC2z6CeM6tTjz3TNmp8sTxikwwT4jzZ9HRy76gqQjb8g==; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_identity=CiY4MDg5NTE5MTAxMTg2NDkzMzQzMTI2OTY5MjMzMTU3OTYwODc1N1ITCM6izY3mMRABGAEqBElORDEwAPAB5cnS14oy; Hm_lvt_953dddc9c7bea72022e3bd3ba7782e7b=1720765103,1720855080; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=MCMID|80895191011864933431269692331579608757; JSESSIONID=90BFBDCF8874DBB2B708D37ACC4172DD; __cf_bm=FgCtBcokrG75eoj6.nqj2jTcbcl.vtSPGArq4iAYwYk-1720855074-1.0.1.1-OCKWcrDvKtyaaNLld1aBjaFFwZLoLHauSzJ0NEZFn1JLYK4G4lqmaTMEE50PAzZCReTc13aRgLNyLlqu6JOllleWjBRMQr5vc3YjxJ4kdPs; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_cluster=ind1; cf_clearance=B0r0CEgCWVP2M5CKvRhRTvIW8MyIJM2WBVS14NsHxxE-1720855079-1.0.1.1-CqrZHd19zoe3QCemtBtqxsHiVLXILmnPkb9RjSG2yHndhy.XZzt14jGpjymiEPzjA0nFP7xw1hU6xsXIz6UDSg; Hm_lpvt_953dddc9c7bea72022e3bd3ba7782e7b=1720855160; HMACCOUNT=C851A9F6625CC221; randomizeUser=0.5543043437474287
|
47 |
-
Upgrade-Insecure-Requests: 1
|
48 |
-
Sec-Fetch-Dest: document
|
49 |
-
Sec-Fetch-Mode: navigate
|
50 |
-
Sec-Fetch-Site: none
|
51 |
-
Sec-Fetch-User: ?1
|
52 |
-
Sec-GPC: 1
|
53 |
-
TE: trailers
|
54 |
-
"""
|
55 |
-
# url = f"https://onlinelibrary.wiley.com/toc/14679590/{year}/{volume}/{issue}"
|
56 |
-
data = browser.get(url, headers=get_headers(headers))
|
57 |
-
fullPage = BeautifulSoup(data.text, "lxml")
|
58 |
issuelinks = []
|
59 |
for link in fullPage.findAll("a", {"class" : "issue-item__title visitable"}):
|
60 |
issuelinks.append(f'https://onlinelibrary.wiley.com{link.get("href")}')
|
|
|
3 |
from sheets import ExcelAutomator
|
4 |
from seleliumdriver import WebScraper
|
5 |
|
|
|
|
|
6 |
def save(data:str):
|
7 |
with open("data.html", "w") as op:
|
8 |
op.write(str(data))
|
|
|
33 |
out[key] = value
|
34 |
return out
|
35 |
|
36 |
+
def get_links(url: str) -> list:
|
37 |
+
browser = WebScraper(browser="firefox", hidden=False)
|
38 |
+
browser.get(url) #browser.get(url, headers=get_headers(headers))
|
39 |
+
fullPage = BeautifulSoup(browser.get_html(), "lxml")
|
40 |
+
save(browser.get_html())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
issuelinks = []
|
42 |
for link in fullPage.findAll("a", {"class" : "issue-item__title visitable"}):
|
43 |
issuelinks.append(f'https://onlinelibrary.wiley.com{link.get("href")}')
|