added ability to pull links containing some url from webpage

author: thomas grothe <grothe.tr@gmail.com> 2023-11-12 22:28:03 -0600
committer: thomas grothe <grothe.tr@gmail.com> 2023-11-12 22:28:03 -0600
commit: 0e506425dd0e7df58aee749b713a84d65f93bad9 (patch)
tree: b6ae0df5c79eb4fb84ac9528dad8f7be71d3c48a
parent: 1692be1b54344b15550499ecd5d8a469b26187f0 (diff)
1 files changed, 13 insertions, 0 deletions
diff --git a/4chan_search/wwwimgpull.py b/4chan_search/wwwimgpull.py
index f379fb5..6f341c5 100755
--- a/4chan_search/wwwimgpull.py
+++ b/4chan_search/wwwimgpull.py
@@ -70,6 +70,19 @@ def pullPDFs(url, depth=0, alreadycrawled=[]):
             result = result + pullPDFs(url, depth+1, alreadycrawled)
     return result
 
+#return a list of strings of all the links on the given webpage (<a> elements) whose href contains the given search string
+#url: address of the page, fetched with a single requests.get call
+#s: substring to look for; matching hrefs are returned as parsed (relative hrefs are not resolved against url)
+def getLinksContainingStr(url, s):
+    resp = requests.get(url)
+    html = BeautifulSoup(resp.text, 'html.parser')
+    #link.get('href') is None for anchors without an href (e.g. <a name="top">);
+    #those are skipped, since testing "s in None" would raise a TypeError
+    hrefs = (link.get('href') for link in html.find_all('a'))
+    return [h for h in hrefs if h is not None and s in h]
+
+
+
 #if len(sys.argv) < 2:
 #    sys.exit(0)
author	thomas grothe <grothe.tr@gmail.com>	2023-11-12 22:28:03 -0600
committer	thomas grothe <grothe.tr@gmail.com>	2023-11-12 22:28:03 -0600
commit	0e506425dd0e7df58aee749b713a84d65f93bad9 (patch)
tree	b6ae0df5c79eb4fb84ac9528dad8f7be71d3c48a
parent	1692be1b54344b15550499ecd5d8a469b26187f0 (diff)