diff options
| -rw-r--r-- | 4chan_search/wwwimgpull.py | 29 | ||||
| -rw-r--r-- | update-hosts.service | 4 | ||||
| -rwxr-xr-x | update_hosts | 7 |
3 files changed, 35 insertions, 5 deletions
diff --git a/4chan_search/wwwimgpull.py b/4chan_search/wwwimgpull.py
index 6d7ab73..f8eb52c 100644
--- a/4chan_search/wwwimgpull.py
+++ b/4chan_search/wwwimgpull.py
@@ -2,6 +2,7 @@
 import requests
 from bs4 import BeautifulSoup
 import re
 import sys
+import time
 ###
 # get images from websites
@@ -30,6 +31,34 @@ def pullImgs(url):
         result.append(srcURL)
     return result
 
+#def pullPDFs(url):
+# return pullPDFs(url, 0)
+
+def pullPDFs(url, depth=0, alreadycrawled=[]):
+    if depth > 5 or url == '' or url is None or url in alreadycrawled:
+        return []
+    baseurl=url[0:url.find('/', 8)+1]
+    result = []
+    print(url)
+    resp = requests.get(url)
+    html = BeautifulSoup(resp.text, 'html.parser')
+    alreadycrawled.append(url)
+    for a in html.find_all('a'):
+        url = a.get('href')
+        if url is None or url == '' or url == '/':
+            continue
+        if baseurl not in url and 'http' in url[0:4]: #this means that the url is pointing to external site
+            continue
+        print('found ' + url)
+        if 'http' not in url:
+            url = baseurl+url
+        if url.find('.pdf')>0 and os.path.isfile(url):
+            result.append(url)
+        else:
+            time.sleep(5)
+            result = result + pullPDFs(url, depth+1, alreadycrawled)
+    return result
+
 #if len(sys.argv) < 2:
 # sys.exit(0)
 
diff --git a/update-hosts.service b/update-hosts.service
index 6280cf5..2aa6111 100644
--- a/update-hosts.service
+++ b/update-hosts.service
@@ -1,7 +1,7 @@
 [Unit]
 Description=updates /etc/hosts with my custom hosts
-Requires=network-online.target
-After=multi-user.target
+Requires=network-connected.target
+After=multi-user.target network-connected.target
 
 [Service]
 Type=simple
diff --git a/update_hosts b/update_hosts
index 745eb68..0b3cc5a 100755
--- a/update_hosts
+++ b/update_hosts
@@ -8,9 +8,10 @@
 localcheck=`nmap -sn 192.168.1.0/24 | grep debian`
 if [[ $localcheck ]]; then
     hb_ip=`echo $localcheck | sed 's/.*(\(.*\))/\1/g'`
 else
-    hb_ip=`ping grothe.ddns.net -c 1 | sed 's/bytes from.*(\(.*\)).*/\1/g'`
+    hb_ip=`ping grothe.ddns.net -c 1 | grep PING | awk '{print $3}' | sed 's/(\|)//g'`
 fi
 
-sed -i 's/'${hb_ip}'.*//g'
-echo '${hb_ip} hb' >> /etc/hosts
+#sed -i 's/'${hb_ip}'.*//g' /etc/hosts
+sed -i 's/.*hb//g' /etc/hosts
+echo "${hb_ip} hb" >> /etc/hosts
