1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
|
#!/usr/bin/python3
import json
import requests
import sqlite3
import sys
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz, process
from wwwimgpull import *
import argparse
# Board short names whose catalogs main() walks when no single board is requested.
bods = [
    'a', 'c', 'w', 'm', 'cgl', 'cm', 'f', 'n', 'jp', 'vp',
    'v', 'vg', 'vr', 'co', 'g', 'tv', 'k', 'o', 'an', 'tg',
    'sp', 'asp', 'sci', 'int', 'out', 'toy', 'biz', 'i', 'po', 'p',
    'ck', 'ic', 'wg', 'mu', 'fa', '3', 'gd', 'diy', 'wsg', 's',
    'trv', 'fit', 'x', 'lit', 'adv', 'lgbt', 'mlp', 'b', 'r', 'r9k',
    'pol', 'soc', 's4s',
]

# A second list of board short names. Nothing in the visible code reads it
# (presumably boards deliberately kept out of the default scan -- TODO confirm).
abods = [
    'hc', 'hm', 'h', 'e', 'u', 'd', 'y', 't', 'hr', 'gif',
]
class FuzzySearchConfig:
    """Settings that control how fuzzy_match() scores and accepts a candidate.

    Every setting can be supplied as a keyword argument; calling the
    constructor with no arguments yields the original defaults.

    Attributes (all plain, writable instance attributes):
        min_ratio: Minimum similarity score (0-100) a text must reach to count
            as a match.
        partial_ratio_weight: Multiplier applied to the partial-ratio score.
        token_sort_weight: Multiplier applied to the token-sort score.
        enable_partial: Whether the partial-ratio score is computed at all.
        enable_token_sort: Whether the token-sort score is computed at all.
    """

    def __init__(self, *, min_ratio=60, partial_ratio_weight=0.7,
                 token_sort_weight=0.3, enable_partial=True,
                 enable_token_sort=True):
        self.min_ratio = min_ratio  # Minimum similarity ratio (0-100)
        self.partial_ratio_weight = partial_ratio_weight
        self.token_sort_weight = token_sort_weight
        self.enable_partial = enable_partial
        self.enable_token_sort = enable_token_sort

    def __repr__(self):
        return (
            f'{type(self).__name__}('
            f'min_ratio={self.min_ratio!r}, '
            f'partial_ratio_weight={self.partial_ratio_weight!r}, '
            f'token_sort_weight={self.token_sort_weight!r}, '
            f'enable_partial={self.enable_partial!r}, '
            f'enable_token_sort={self.enable_token_sort!r})'
        )
def fuzzy_match(search_term, text, config):
    """Decide whether *text* matches *search_term*, exactly or approximately.

    The comparison is case-insensitive. A literal substring hit always counts
    as a match; otherwise several fuzzywuzzy similarity scores are computed
    and the best of them is compared with ``config.min_ratio``.

    Args:
        search_term: The query string. ``"*"`` and ``""`` are wildcards that
            match any non-empty text; any other falsy value never matches.
        text: The text to test. Empty or ``None`` text never matches.
        config: Object exposing ``min_ratio``, ``enable_partial``,
            ``partial_ratio_weight``, ``enable_token_sort`` and
            ``token_sort_weight``.

    Returns:
        True if the text is accepted as a match, False otherwise.
    """
    if not text:
        return False

    # Wildcards must be recognised before the generic "empty search term"
    # rejection below; with the checks the other way round the "" wildcard
    # could never be reached.
    if search_term in ("*", ""):
        return True
    if not search_term:
        return False

    search_lower = search_term.lower()
    text_lower = text.lower()

    # Exact (substring) match has the highest priority and needs no scoring.
    if search_lower in text_lower:
        return True

    # The whole-string ratio and the token-set ratio are used unweighted; the
    # partial and token-sort ratios are optional and scaled by their weights.
    scores = [
        fuzz.ratio(search_lower, text_lower),
        fuzz.token_set_ratio(search_lower, text_lower),
    ]
    if config.enable_partial:
        scores.append(
            fuzz.partial_ratio(search_lower, text_lower) * config.partial_ratio_weight
        )
    if config.enable_token_sort:
        scores.append(
            fuzz.token_sort_ratio(search_lower, text_lower) * config.token_sort_weight
        )

    # The best individual score decides.
    return max(scores) >= config.min_ratio
def processCatalog(catalog, b, search_config):
    """Scan one board catalog and record every post that matches the search word.

    The search word is read from the module-level ``wod``; hits are appended
    to the module-level lists ``results_url``, ``results_content`` and
    ``results_img``, which must exist before this function is called.

    Args:
        catalog: Parsed catalog JSON: a list of pages, each holding a
            ``'threads'`` list of thread dicts.
        b: Board short name, used to build the thread URLs.
        search_config: Matching settings handed through to fuzzy_match().

    Recorded shapes (kept as they were): a matching opening post appends the
    tuple ``(url, last_modified)`` to ``results_url``, while a matching reply
    appends a plain URL string carrying a ``#p<no>`` anchor.
    """
    for page in catalog:  # each page of the board
        for thread in page['threads']:  # each OP on the page
            url = "https://boards.4channel.org/" + b + "/thread/" + str(thread['no'])

            # Use fuzzy matching instead of simple string containment
            if 'com' in thread and fuzzy_match(wod, thread['com'], search_config):
                results_url.append((url, thread['last_modified']))
                results_content.append(thread['com'])
                for imgurl in pull4chImgs(url):
                    results_img.append(imgurl)

            # An opening post without text (no 'com' key) must not hide its
            # replies from the search, so the replies are examined regardless
            # of whether the OP had a comment.
            for reply in thread.get('last_replies', ()):  # each comment on the OP
                if 'com' not in reply:
                    continue
                # Use fuzzy matching for replies too
                if fuzzy_match(wod, reply['com'], search_config):
                    results_url.append(url + "#p" + str(reply['no']))
                    results_content.append(reply['com'])
                    # No image pull here: images are only fetched above, when
                    # the opening post itself matched.
#def processThread(thread):
def repliesSort(catalog, board=None):
    """Return ``(thread_url, reply_count)`` pairs for a catalog, fewest replies first.

    Args:
        catalog: Parsed catalog JSON: a list of pages, each holding a
            ``'threads'`` list whose entries provide ``'no'`` and ``'replies'``.
        board: Board short name used in the thread URLs. When omitted, the
            module-level ``bod`` is used, which is the original behaviour.
            Pass it explicitly when the catalog does not belong to ``bod``
            (for example while iterating over several boards).

    Returns:
        A new list of ``(url, replies)`` tuples sorted ascending by reply
        count; threads with equal counts keep their catalog order.
    """
    if board is None:
        board = bod
    prefix = "https://boards.4channel.org/" + board + "/thread/"
    result = [
        (prefix + str(thread['no']), thread['replies'])
        for page in catalog
        for thread in page['threads']
    ]
    result.sort(key=lambda pair: pair[1])
    return result
def print_usage():
    """Print a short description of the tool, its options and example calls.

    The flags shown are the ones registered with argparse in parseargs(): the
    board is selected with ``-b/--board`` (it is not a positional argument)
    and there is no bare ``--fuzzy`` switch, so the usage line and examples
    use the spellings the parser actually accepts.
    """
    lines = (
        'This program will give you the URLs of all 4chan posts that contain the given search word, either on the entire site or on a select board.',
        'It supports fuzzy search for typos and partial matches.',
        'Usage: ./query.py <searchword> [-b <board>] [options]',
        'Options:',
        '  -b, --board <board>        Limit the search to one board (default: all boards)',
        '  -z, --fuzzy-ratio <0-100>  Set minimum fuzzy match ratio (default: 60)',
        '  --exact-only               Disable fuzzy search, use exact matching only',
        '  --strict                   Use stricter fuzzy matching (ratio: 80)',
        '  --loose                    Use looser fuzzy matching (ratio: 40)',
        '  -v, --verbose              Verbose output',
        'Examples:',
        '  ./query.py "programming" -b g   # Search for "programming" on /g/',
        '  ./query.py "programing" -b g    # Will also match "programming"',
        '  ./query.py "linux" --strict     # Strict matching across all boards',
    )
    print('\n'.join(lines))
def parseargs():
    """Parse the command line into a query, a board and a fuzzy-search config.

    Besides returning them, the parsed values are published through the
    module-level names declared below: ``v`` (verbose flag), ``wod`` (search
    word) and ``bod`` (board, ``''`` meaning all boards).

    The match threshold is chosen by the first of these that applies:
      1. ``--exact-only``  -> 101, which no 0-100 similarity score can reach,
         so only the literal substring check in fuzzy_match() can succeed;
      2. ``--fuzzy-ratio`` -> the given value (validated to lie in 0-100);
      3. ``--strict``      -> 80;
      4. ``--loose``       -> 40;
      5. otherwise the FuzzySearchConfig default.

    Returns:
        Tuple ``(query, board, config)``.
    """
    global v
    global wod
    global bod
    parser = argparse.ArgumentParser(description='search for current 4chan posts that contain some string')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose')
    # type=int: without it the ratio arrives as a str and the numeric
    # comparisons against it raise TypeError.
    parser.add_argument('-z', '--fuzzy-ratio', action='store', type=int, help='(0-100) minimum fuzzy match ratio (default: 60)')
    parser.add_argument('--strict', action='store_true', default=False, help='use stricter fuzzy matching (ratio: 80)')
    parser.add_argument('--loose', action='store_true', default=False, help='Use looser fuzzy matching (ratio: 40)')
    parser.add_argument('--exact-only', action='store_true', default=False, help='Disable fuzzy search, use exact matching only')
    parser.add_argument('query', help='the search word')
    parser.add_argument('--board', '-b', default='', help="Choose a board to limit your query to")
    args = parser.parse_args()

    # "is not None" rather than truthiness so that an explicit 0 is honoured.
    if args.fuzzy_ratio is not None and not 0 <= args.fuzzy_ratio <= 100:
        parser.error('--fuzzy-ratio must be between 0 and 100')

    config = FuzzySearchConfig()
    if args.exact_only:
        config.min_ratio = 101
        config.enable_partial = False
        config.enable_token_sort = False
    elif args.fuzzy_ratio is not None:
        config.min_ratio = args.fuzzy_ratio
    elif args.strict:
        config.min_ratio = 80
    elif args.loose:
        config.min_ratio = 40

    v = args.verbose
    wod = args.query
    bod = args.board
    return args.query, args.board, config
#####################################################################
def main():
    """Run the search described by the command line and print the results.

    Fetches the catalog of the requested board (or of every board in ``bods``
    when no board was given), prints each catalog's threads ordered by reply
    count, lets processCatalog() collect the matching posts, then prints the
    matches and up to five sample texts with a partial-ratio score.
    """
    # processCatalog() and repliesSort() look these names up at module level.
    # As locals of main() they were invisible to those helpers, which then
    # failed with NameError, so they are published as globals here.
    global wod, bod, results_url, results_content, results_img

    # Parse command line arguments
    wod, bod, search_config = parseargs()
    results_url = []      # URLs of threads/posts containing the keyword
    results_content = []  # text of all posts and comments containing the keyword
    results_img = []      # image URLs pulled from threads whose OP matched
    print(f'Fuzzy searching for "{wod}" on "{bod}" (min ratio: {search_config.min_ratio})')

    if bod == '':
        print('searching all boards')
        repl_res = []
        for b in bods:
            print(f'Processing board /{b}/')
            try:
                catalog = _fetch_catalog(b)
                if catalog is None:
                    continue
                # TODO: process each thread individually; see
                # https://github.com/seanpm2001/4Chan_4Chan-API/blob/master/pages/Endpoints_and_domains.md
                # (The former call to processThread() was removed: no such
                # function is defined, so it aborted every board.)
                # repliesSort() builds its URLs from the module-level `bod`,
                # so point it at the board currently being processed.
                bod = b
                # extend, not append: one flat list of (url, replies) pairs
                # is what the sort by reply count below needs.
                repl_res.extend(repliesSort(catalog))
                processCatalog(catalog, b, search_config)
            except Exception as e:
                print(f'Error processing board /{b}/: {e}')
        bod = ''  # back to "all boards" once the loop is done
        repl_res.sort(key=lambda v: v[1])
        print(repl_res)
    else:
        print('searching board ' + bod)
        try:
            catalog = _fetch_catalog(bod)
            if catalog is not None:
                print(repliesSort(catalog))
                processCatalog(catalog, bod, search_config)
        except Exception as e:
            print(f'Error processing board /{bod}/: {e}')

    print(f'\nFound {len(results_url)} matches:')
    for url in results_url:
        print(url)

    # Optional: Show some sample matches with their similarity scores
    if results_content and search_config.min_ratio < 100:
        print(f'\nSample fuzzy matches for "{wod}":')
        for content in results_content[:5]:  # Show first 5 matches
            score = fuzz.partial_ratio(wod.lower(), content.lower())
            print(f'[{score}%] {_preview(content)}')


def _fetch_catalog(board):
    """Download and decode one board's catalog; return None if the API reports an error."""
    # A timeout keeps one unresponsive request from hanging the whole run.
    res = requests.get("https://a.4cdn.org/" + board + "/catalog.json", timeout=30)
    if not res:  # a requests.Response is falsy for 4xx/5xx status codes
        print(f'Error getting response from API, board /{board}/.')
        return None
    # Decode once and reuse the result instead of re-parsing for each consumer.
    return json.loads(res.text)


def _preview(content, limit=100):
    """Turn a post's HTML text into a single display line of at most *limit* characters."""
    import html  # local import: only needed for this display helper

    # Line breaks become spaces and entities such as &gt; / &lt; are decoded.
    clean_content = html.unescape(content.replace('<br>', ' '))
    if len(clean_content) > limit:
        clean_content = clean_content[:limit] + "..."
    return clean_content


if __name__ == '__main__':
    main()
|