summaryrefslogtreecommitdiff
path: root/search4c_oldhb.py
blob: bed39b494de14df81eab4f0a70bf0e0ac99825a9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python3

import json
import requests
import sqlite3
import sys
from wwwimgpull import *


def processCatalog(catalog, b):

    for i in range(0, len(catalog)): #each page of the board
        for j in range(0, len(catalog[i]['threads'])): #each OP on the page
            if not 'com' in catalog[i]['threads'][j]:
                continue
            url = "https://boards.4channel.org/"+b+"/thread/"+str(catalog[i]['threads'][j]['no'])
            if wod == "*" or wod == "" or wod.lower() in catalog[i]['threads'][j]['com'].lower():
                results_url.append(url)
                results_content.append(catalog[i]['threads'][j]['com'])
                for imgurl in pull4chImgs(url):
                    results_img.append(imgurl)
            if not 'last_replies' in catalog[i]['threads'][j]:
                continue
            for k in range(0, len(catalog[i]['threads'][j]['last_replies'])): #each comment on the OP
                r = catalog[i]['threads'][j]['last_replies'][k]
                if not 'com' in r:
                        continue
                if wod.lower() in r['com'].lower():
                    results_url.append(url+"#p"+str(catalog[i]['threads'][j]['last_replies'][k]['no']))
                    results_content.append(catalog[i]['threads'][j]['last_replies'][k]['com'])
                    #imgs were already retrieved from OP grab

bods = ['a', 'c', 'w', 'm', 'cgl', 'cm', 'f', 'n', 'jp', 'vp', 'v', 'vg', 'vr', 'co', 'g', 'tv', 'k', 'o', 'an', 'tg', 'sp', 'asp', 'sci', 'int', 'out', 'toy', 'biz', 'i', 'po', 'p', 'ck', 'ic', 'wg', 'mu', 'fa', '3', 'gd', 'diy', 'wsg', 's', 'trv', 'fit', 'x', 'lit', 'adv', 'lgbt', 'mlp', 'b', 'r', 'r9k', 'pol', 'soc', 's4s']
abods = ['hc', 'hm', 'h', 'e', 'u', 'd', 'y', 't', 'hr', 'gif']


#####################################################################


if len(sys.argv) < 2 or sys.argv[1][0] == '-':
    print('This program will give you the URLs of all 4chan posts that contain the given search word, either on the entire site or on a select board. You can also use wildcard as search word for all posts of board/site')
    print('you must provide a search word, or \"*\" for any word.')
    print('Usage: ./query.py <searchword> [board]')
    sys.exit(0)

wod = sys.argv[1] #the search keyword
bod = ''
if len(sys.argv) > 2:
    bod = sys.argv[2] #the board if interest, if given

results_url = [] #URLs of threads containing the keyword
results_content = [] #text of all posts and comments containing the keyword
results_img = [] #URLs of all images containing the keyword

if bod == '':
    for b in bods:
        res = requests.get("https://a.4cdn.org/"+b+"/catalog.json")
        processCatalog(json.loads(res.text), b)
else:
    res = requests.get("https://a.4cdn.org/"+bod+"/catalog.json")
    processCatalog(json.loads(res.text), bod)


for url in results_url:
    print(url)