H4CK3R-5M4CK3R committed
Commit c83cceb · 1 Parent(s): ccab18e

added docs

Files changed (2)
  1. app.py +60 -3
  2. wileyscrapper.py +5 -22
app.py CHANGED
@@ -17,6 +17,38 @@ from builtins import print as original_print
 import random
 import string
 
+description = """
+For bug reports or improvements, contact [@H4CK3R_5M4CK3R](https://t.me/H4CK3R_5M4CK3R) on Telegram.
+
+**Usage Instructions:**
+
+1. **Single Issue Scraping:**
+   - Provide the issue link in the URL field.
+   - Optionally, specify the desired output file name.
+
+2. **Multiple Issues Scraping:**
+   - Use curly braces `{}` in the URL to indicate where the volume (`v`) and issue (`i`) numbers should be inserted.
+   - Define the range for volumes, not issues, and make sure the volume range is passed correctly.
+
+**Note:**
+- The range is a volume range, not an issue range.
+- Some authors may not have a listed address; their profile link is included in the address column instead.
+- After the run completes, make sure to click Clear, because the notification sound sometimes doesn't play.
+"""
+
+exmpl = [
+    ["https://www.ams.org/journals/jams/2024-37-01/home.html?active=allissues", 0, 0, "example1", False],
+    ["https://www.degruyter.com/journal/key/fca/20/2/html", 0, 0, "example2", False],
+    ["https://www.degruyter.com/journal/key/fca/{v}/{i}/html", 20, 23, "example3", False],
+    ["https://www.aimspress.com/math/article/2024/8/archive-articles", 0, 0, "example4", False],
+    ["https://www.aimspress.com/math/article/{v}/{i}/archive-articles", 2021, 2024, "example5", False],
+    ["https://link.springer.com/journal/208/volumes-and-issues/388-3", 0, 0, "example6", False],
+    ["https://link.springer.com/journal/208/volumes-and-issues/{v}-{i}", 388, 389, "example6", False],
+    ["https://www.sciencedirect.com/journal/advances-in-applied-mathematics/vol/158/suppl/C", 0, 0, "example7", False],
+    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/38/issue/6", 0, 0, "example7", False],
+    ["https://www.sciencedirect.com/journal/acta-mathematica-scientia/vol/{v}/issue/{i}", 36, 38, "example7", False]
+]
+
 def generate_random_filename(length=8):
     random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
     return random_string
@@ -208,6 +240,30 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
                     break
             sht.save_to_file()
             return sht.save_to_file()
+
+        elif domain == "onlinelibrary.wiley.com":
+            # Wiley Online Library data here
+            for ur in links:
+                isu = 1
+                while True:
+                    try:
+                        if filen:
+                            print(f"Getting data for link {ur.format(i=isu)}")
+                            authors, _ = wileyscrapper.run(ur.format(i=isu))
+                            isu += 1
+                        else:
+                            print(f"Getting data for link {ur}")
+                            authors, _ = wileyscrapper.run(ur)
+                    except Exception as e:
+                        print(f"Error : {traceback.format_exc()}", show=False)
+                        break
+                    for auth in authors:
+                        sht.save(auth)
+                    if filen == False:  # single, non-templated URL: one pass is enough
+                        break
+            sht.save_to_file()
+            return sht.save_to_file()
+
         else:
             raise gr.Error("Invalid url found contact : @H4CK3R_5M4CK3R on telegram")
     except gr.Error:
@@ -216,8 +272,8 @@ def filterUrlandRun(url: str, from_range: int, to_range: int, reverse: bool, out
         print(f"ERROR : {traceback.format_exc()}", show=False)
         raise gr.Error("Something error has occur check your url or contact @h4ck3r_5m4ck3r on telegram")
 
-def handle_url(url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
-    output = filterUrlandRun(url, From_volume, To_Volume, Reverse, Output)
+def handle_url(Url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
+    output = filterUrlandRun(Url, From_volume, To_Volume, Reverse, Output)
     # threading.Thread(target=play_sound).start()
     return output, gr.Audio("notification.mp3", autoplay=True)
@@ -226,7 +282,8 @@ interface = gr.Interface(
     inputs=["textbox", "number", "number", "textbox","checkbox"],
     outputs=["file", "audio"],
     title="Web Scrapper",
-    description="Enter a URL and download a generated XLSX file."
+    description=description,
+    examples=exmpl
 )
 
 interface.launch()
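The templated URLs added in the exmpl list above expand via Python string formatting: the two number inputs supply the volume range, and issue numbers are then probed per volume. A minimal sketch of the expansion, assuming plain str.format with keyword arguments:

url = "https://www.degruyter.com/journal/key/fca/{v}/{i}/html"
for v in range(20, 24):          # volume range 20-23, as in example3
    print(url.format(v=v, i=1))  # issue numbers are then probed one by one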
 
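The new onlinelibrary.wiley.com branch walks issue numbers upward until a fetch fails. A minimal sketch of that probe-until-failure pattern, with fetch_issue standing in for wileyscrapper.run (a hypothetical name, not the project's API):

def collect_authors(url_template: str) -> list:
    authors = []
    isu = 1
    while True:
        try:
            # fetch_issue is hypothetical; the real code calls wileyscrapper.run
            authors.extend(fetch_issue(url_template.format(i=isu)))
        except Exception:
            break  # no issue with this number: the volume is exhausted
        isu += 1
    return authors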
 
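The interface hunk only shows the arguments this commit touches; the fn wiring sits outside the visible context lines. A minimal, runnable sketch of how the pieces are assumed to fit together (fn=handle_url is inferred from the matching signature and outputs, not confirmed by the diff):

import gradio as gr

def handle_url(Url, From_volume: int, To_Volume: int, Output: str, Reverse: bool):
    # stub for the real scraper, which returns
    # (xlsx_path, gr.Audio("notification.mp3", autoplay=True))
    return None, None

interface = gr.Interface(
    fn=handle_url,  # assumption: this argument is outside the hunk's context
    inputs=["textbox", "number", "number", "textbox", "checkbox"],
    outputs=["file", "audio"],
    title="Web Scrapper",
)
interface.launch()

Each row of exmpl fills the five inputs in order, which is why every example carries a URL, two numbers, an output name, and a boolean.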
wileyscrapper.py CHANGED
@@ -3,8 +3,6 @@ from bs4 import BeautifulSoup
 from sheets import ExcelAutomator
 from seleliumdriver import WebScraper
 
-browser = requests.session()
-
 def save(data:str):
     with open("data.html", "w") as op:
         op.write(str(data))
@@ -35,26 +33,11 @@ def get_headers(data: str) -> dict:
         out[key] = value
     return out
 
-def get_links(url: str, issue: int) -> list:
-    headers = """
-    User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0
-    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
-    Accept-Language: en-US,en;q=0.5
-    Accept-Encoding: gzip, deflate, br
-    Alt-Used: onlinelibrary.wiley.com
-    Connection: keep-alive
-    Cookie: MAID=5SmfjugKsbanANmqY7QFTQ==; MACHINE_LAST_SEEN=2024-07-13T00%3A17%3A54.434-07%3A00; osano_consentmanager_uuid=a09cf48c-a316-44da-a630-b284fe618561; osano_consentmanager=wtFVO73sxrqPK1QjgWvz2PRznZ_IuLc6ARcv2t0_pFtepafXHZgrg-S478uJo9AvbIWsu3sbpgmvaCKL_zNkJQZzpvdHNzGX6NQQ6cwL_c09p-7H9gmYq7lFeOBlGJxYVbwgVIa5TZDqtpLjvla4iYf-rEyPZ0zXi8nZVVY5aCRrKBkWiIYkwIWvpeVeBLepXirD0RkYCGg-O2PWE000CQi4kWVXGTOkNMFqFOSQ-tthQqpC7pvT9AeCAodC2z6CeM6tTjz3TNmp8sTxikwwT4jzZ9HRy76gqQjb8g==; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_identity=CiY4MDg5NTE5MTAxMTg2NDkzMzQzMTI2OTY5MjMzMTU3OTYwODc1N1ITCM6izY3mMRABGAEqBElORDEwAPAB5cnS14oy; Hm_lvt_953dddc9c7bea72022e3bd3ba7782e7b=1720765103,1720855080; AMCV_1B6E34B85282A0AC0A490D44%40AdobeOrg=MCMID|80895191011864933431269692331579608757; JSESSIONID=90BFBDCF8874DBB2B708D37ACC4172DD; __cf_bm=FgCtBcokrG75eoj6.nqj2jTcbcl.vtSPGArq4iAYwYk-1720855074-1.0.1.1-OCKWcrDvKtyaaNLld1aBjaFFwZLoLHauSzJ0NEZFn1JLYK4G4lqmaTMEE50PAzZCReTc13aRgLNyLlqu6JOllleWjBRMQr5vc3YjxJ4kdPs; kndctr_1B6E34B85282A0AC0A490D44_AdobeOrg_cluster=ind1; cf_clearance=B0r0CEgCWVP2M5CKvRhRTvIW8MyIJM2WBVS14NsHxxE-1720855079-1.0.1.1-CqrZHd19zoe3QCemtBtqxsHiVLXILmnPkb9RjSG2yHndhy.XZzt14jGpjymiEPzjA0nFP7xw1hU6xsXIz6UDSg; Hm_lpvt_953dddc9c7bea72022e3bd3ba7782e7b=1720855160; HMACCOUNT=C851A9F6625CC221; randomizeUser=0.5543043437474287
-    Upgrade-Insecure-Requests: 1
-    Sec-Fetch-Dest: document
-    Sec-Fetch-Mode: navigate
-    Sec-Fetch-Site: none
-    Sec-Fetch-User: ?1
-    Sec-GPC: 1
-    TE: trailers
-    """
-    # url = f"https://onlinelibrary.wiley.com/toc/14679590/{year}/{volume}/{issue}"
-    data = browser.get(url, headers=get_headers(headers))
-    fullPage = BeautifulSoup(data.text, "lxml")
+def get_links(url: str) -> list:
+    browser = WebScraper(browser="firefox", hidden=False)
+    browser.get(url)  # previously: browser.get(url, headers=get_headers(headers))
+    fullPage = BeautifulSoup(browser.get_html(), "lxml")
+    save(browser.get_html())
     issuelinks = []
     for link in fullPage.findAll("a", {"class" : "issue-item__title visitable"}):
         issuelinks.append(f'https://onlinelibrary.wiley.com{link.get("href")}')
 
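This commit replaces the hard-coded headers and cookies with the project's own seleliumdriver.WebScraper, whose API beyond get() and get_html() is not shown here. A minimal sketch of an equivalent fetch using the stock selenium package (an assumption about what the wrapper does, not the project's actual implementation):

from selenium import webdriver

def fetch_html(url: str) -> str:
    options = webdriver.FirefoxOptions()
    # hidden=False in the diff suggests a visible window; a headless run would
    # add: options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)            # the live browser session manages cookies itself
        return driver.page_source  # roughly what WebScraper.get_html() returns
    finally:
        driver.quit()

Driving a real browser sidesteps the brittle hand-copied Cookie header that the removed code depended on, at the cost of needing a local Firefox and geckodriver.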