summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author thomas grothe <grothe.tr@gmail.com> 2023-11-12 22:28:03 -0600
committer thomas grothe <grothe.tr@gmail.com> 2023-11-12 22:28:03 -0600
commit 0e506425dd0e7df58aee749b713a84d65f93bad9 (patch)
tree b6ae0df5c79eb4fb84ac9528dad8f7be71d3c48a
parent 1692be1b54344b15550499ecd5d8a469b26187f0 (diff)
added ability to pull links containing some url from webpage
-rwxr-xr-x 4chan_search/wwwimgpull.py 13
1 files changed, 13 insertions, 0 deletions
diff --git a/4chan_search/wwwimgpull.py b/4chan_search/wwwimgpull.py
index f379fb5..6f341c5 100755
--- a/4chan_search/wwwimgpull.py
+++ b/4chan_search/wwwimgpull.py
@@ -70,6 +70,19 @@ def pullPDFs(url, depth=0, alreadycrawled=[]):
result = result + pullPDFs(url, depth+1, alreadycrawled)
return result
#return a list of strings of all the links on the given webpage (<a> elements) whose href contains the given search string
#  url: address of the page to fetch (passed straight to requests.get)
#  s:   substring to look for inside each link's href value
#  returns: list of the matching href strings, in document order (may be empty)
def getLinksContainingStr(url, s):
    resp = requests.get(url)
    html = BeautifulSoup(resp.text, 'html.parser')
    # Tag.get() returns None for <a> elements that have no href attribute
    # (e.g. named anchors such as <a name="top">); `s in None` would raise
    # TypeError, so those anchors are skipped instead of aborting the scan.
    hrefs = (link.get('href') for link in html.find_all('a'))
    return [h for h in hrefs if h is not None and s in h]
+
+
+
#if len(sys.argv) < 2:
# sys.exit(0)