diff options
| -rwxr-xr-x | 4chan_search/TODO.txt | 1 | ||||
| -rwxr-xr-x | 4chan_search/pullimgs.py | 18 | ||||
| -rwxr-xr-x | 4chan_search/query.py | 85 | ||||
| -rwxr-xr-x | 4chan_search/slideshow.sh | 16 | ||||
| -rwxr-xr-x | 4chan_search/wwwimgpull.py | 91 | ||||
| -rwxr-xr-x | 4chan_search/wwwimgpull.pyc | bin | 1109 -> 0 bytes |
6 files changed, 0 insertions, 211 deletions
diff --git a/4chan_search/TODO.txt b/4chan_search/TODO.txt deleted file mode 100755 index 3c2702d..0000000 --- a/4chan_search/TODO.txt +++ /dev/null @@ -1 +0,0 @@ -make wwwimgpull into a general web crawl tool diff --git a/4chan_search/pullimgs.py b/4chan_search/pullimgs.py deleted file mode 100755 index af09c4c..0000000 --- a/4chan_search/pullimgs.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/python - -import json -import requests -import sqlite3 -import sys -from wwwimgpull import * -##################################################################### - -if (len(sys.argv) < 2): - print('This program will download all of the images on a given 4chan thread. provide URL. ') - print('you must provide a search word, or \"*\" for any word.') - print('Usage: ./pullimgs.py <url>') - sys.exit(0) - -url = sys.argv[1] -for imgurl in pull4chImgs(url): - print(imgurl) diff --git a/4chan_search/query.py b/4chan_search/query.py deleted file mode 100755 index 89a87d3..0000000 --- a/4chan_search/query.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/python - -import json -import requests -import sqlite3 -import sys -from wwwimgpull import * - - -def processCatalog(catalog, b): - - for i in range(0, len(catalog)): #each page of the board - for j in range(0, len(catalog[i]['threads'])): #each OP on the page - if not 'com' in catalog[i]['threads'][j]: - continue - url = "https://boards.4channel.org/"+b+"/thread/"+str(catalog[i]['threads'][j]['no']) - if wod == "*" or wod == "" or wod.lower() in catalog[i]['threads'][j]['com'].lower(): - results_url.append((url, catalog[i]['threads'][j]['last_modified'])) - results_content.append(catalog[i]['threads'][j]['com']) - for imgurl in pull4chImgs(url): - results_img.append(imgurl) - if not 'last_replies' in catalog[i]['threads'][j]: - continue - for k in range(0, len(catalog[i]['threads'][j]['last_replies'])): #each comment on the OP - r = catalog[i]['threads'][j]['last_replies'][k] - if not 'com' in r: - continue - if wod.lower() in r['com'].lower(): - results_url.append((url+"#p"+str(catalog[i]['threads'][j]['last_replies'][k]['no'],catalog[i]['threads'][j]['last_replies'][k]['last_modified']))) - results_content.append(catalog[i]['threads'][j]['last_replies'][k]['com']) - #imgs were already retrieved from OP grab - -def repliesSort(catalog): - result = [] - for i in range(0, len(catalog)): - for j in range(0, len(catalog[i]['threads'])): - url = "https://boards.4channel.org/"+b+"/thread/"+str(catalog[i]['threads'][j]['no']) - result.append((url, catalog[i]['threads'][j]['replies'])) - - result.sort(key=lambda v: v[1]) - return result - -bods = ['a', 'c', 'w', 'm', 'cgl', 'cm', 'f', 'n', 'jp', 'vp', 'v', 'vg', 'vr', 'co', 'g', 'tv', 'k', 'o', 'an', 'tg', 'sp', 'asp', 'sci', 'int', 'out', 'toy', 'biz', 'i', 'po', 'p', 'ck', 'ic', 'wg', 'mu', 'fa', '3', 'gd', 'diy', 'wsg', 's', 'trv', 'fit', 'x', 'lit', 'adv', 'lgbt', 'mlp', 'b', 'r', 'r9k', 'pol', 'soc', 's4s'] -abods = ['hc', 'hm', 'h', 'e', 'u', 'd', 'y', 't', 'hr', 'gif'] - - -##################################################################### - -r = True #just for debugging right now - -if not r and (len(sys.argv) < 2 or sys.argv[1][0] == '-'): - print('This program will give you the URLs of all 4chan posts that contain the given search word, either on the entire site or on a select board. You can also use wildcard as search word for all posts of board/site') - print('you must provide a search word, or \"*\" for any word.') - print('Usage: ./query.py <searchword> [board]') - sys.exit(0) - -if not r: - wod = sys.argv[1] #the search keyword -bod = '' -if len(sys.argv) > 2: - bod = sys.argv[2] #the board if interest, if given - -results_url = [] #URLs of threads containing the keyword -results_content = [] #text of all posts and comments containing the keyword -results_img = [] #URLs of all images containing the keyword - -if bod == '': - print('searching all boards') - repl_res = [] - for b in bods: - res = requests.get("https://a.4cdn.org/"+b+"/catalog.json") - repl_res.append(repliesSort(json.loads(res.text))) - #processCatalog(json.loads(res.text), b) - repl_res.sort(key=lambda v: v[1]) - print(repl_res) -else: - print('searching board ' + bod) - res = requests.get("https://a.4cdn.org/"+bod+"/catalog.json") - print(repliesSort(json.loads(res.text))) - processCatalog(json.loads(res.text), bod) - - -for url in results_url: - print(url) - diff --git a/4chan_search/slideshow.sh b/4chan_search/slideshow.sh deleted file mode 100755 index 4b60bde..0000000 --- a/4chan_search/slideshow.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -if [[ -z $1 ]]; then - exit 1 -fi - -f=$1 -n=`wc -l ${f} | cut -d' ' -f 1` - -while [[ true ]]; do - i=`shuf -i 0-${n} -n 1` - url=`sed -n ${i}'p' ${f}` - ristretto ${url} & - sleep 30 - killall ristretto -done diff --git a/4chan_search/wwwimgpull.py b/4chan_search/wwwimgpull.py deleted file mode 100755 index 6f341c5..0000000 --- a/4chan_search/wwwimgpull.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/bin/python - -import requests -from bs4 import BeautifulSoup -import re -import sys -import time - -### -# get images from websites -# will probably be updated to include other data -### - -#returns an array of img src urls from a 4chan thread -def pull4chImgs(url): - result = [] - resp = requests.get(url) - html = BeautifulSoup(resp.text, 'html.parser') - for a in html.find_all('a'): - if 'class' in a.attrs and 'fileThumb' in a.attrs['class']: - url = a.get('href') - if url[0:2] == '//': - url = 'https:' + url - result.append(url) - return result - -def pullVids(url): - result = [] - resp = requests.get(url) - html = BeautifulSoup(resp.text, 'html.parser') - for a in html.find_all('a'): - if '.webm' in a.get('href'): - result.append(a.get('href')) - return result - -def pullImgs(url): - result = [] - resp = requests.get(url) - html = BeautifulSoup(resp.text, 'html.parser') - for img in html.find_all('img'): - srcURL = img.get('src') - result.append(srcURL) - return result - -#def pullPDFs(url): -# return pullPDFs(url, 0) - -def pullPDFs(url, depth=0, alreadycrawled=[]): - if depth > 5 or url == '' or url is None or url in alreadycrawled: - return [] - baseurl=url[0:url.find('/', 8)+1] - result = [] - print(url) - resp = requests.get(url) - html = BeautifulSoup(resp.text, 'html.parser') - alreadycrawled.append(url) - for a in html.find_all('a'): - url = a.get('href') - if url is None or url == '' or url == '/': - continue - if baseurl not in url and 'http' in url[0:4]: #this means that the url is pointing to external site - continue - print('found ' + url) - if 'http' not in url: - url = baseurl+url - if url.find('.pdf')>0 and os.path.isfile(url): - result.append(url) - else: - time.sleep(5) - result = result + pullPDFs(url, depth+1, alreadycrawled) - return result - -#return a list of strings of all the links on the given webpage (<a> elements) whose href contains the given search string -def getLinksContainingStr(url, s): - result = [] - resp = requests.get(url) - html = BeautifulSoup(resp.text, 'html.parser') - for link in html.find_all('a'): - h = link.get('href') - if s in h: - result.append(h) - return result - - - -#if len(sys.argv) < 2: -# sys.exit(0) - -#for url in pull4chImgs(sys.argv[1]): -# print(url) - diff --git a/4chan_search/wwwimgpull.pyc b/4chan_search/wwwimgpull.pyc Binary files differdeleted file mode 100755 index b5ebdda..0000000 --- a/4chan_search/wwwimgpull.pyc +++ /dev/null |
