#!/usr/bin/python3
"""Search current 4chan posts for a word, with optional fuzzy matching.

The script downloads the catalog of one board (or of every board listed in
``bods``), compares the search word against each thread's opening post and
its previewed replies, and prints the URLs of the posts that match.
"""
# NOTE: the original import order is preserved because the wildcard import
# may rebind names; new stdlib imports are appended after it on purpose.
import json
import requests
import sqlite3
import sys
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz, process
from wwwimgpull import *  # noqa: F401,F403 -- presumably provides pull4chImgs
import argparse
import html
import re

# Seconds to wait for the catalog API before giving up on a board.
REQUEST_TIMEOUT = 15

bods = ['a', 'c', 'w', 'm', 'cgl', 'cm', 'f', 'n', 'jp', 'vp', 'v', 'vg',
        'vr', 'co', 'g', 'tv', 'k', 'o', 'an', 'tg', 'sp', 'asp', 'sci',
        'int', 'out', 'toy', 'biz', 'i', 'po', 'p', 'ck', 'ic', 'wg', 'mu',
        'fa', '3', 'gd', 'diy', 'wsg', 's', 'trv', 'fit', 'x', 'lit', 'adv',
        'lgbt', 'mlp', 'b', 'r', 'r9k', 'pol', 'soc', 's4s']
abods = ['hc', 'hm', 'h', 'e', 'u', 'd', 'y', 't', 'hr', 'gif']

# Module-level search state. parseargs() fills in v / wod / bod, and
# processCatalog() appends every hit to the three results_* lists.
v = False             # verbose flag (-v)
wod = ''              # the search word
bod = ''              # board to search; '' means "all boards"
results_url = []      # URLs (str) of matching threads and replies
results_content = []  # raw comment text of every matching post
results_img = []      # image URLs pulled from threads whose OP matched

_BR_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<[^>]+>')


class FuzzySearchConfig:
    """Tunable settings consumed by fuzzy_match()."""

    def __init__(self):
        self.min_ratio = 60  # Minimum similarity ratio (0-100)
        self.partial_ratio_weight = 0.7
        self.token_sort_weight = 0.3
        self.enable_partial = True
        self.enable_token_sort = True
        # When True only the case-insensitive substring test is used.
        self.exact_only = False


def fuzzy_match(search_term, text, config):
    """Return True when *text* matches *search_term* closely enough.

    A case-insensitive substring hit always matches. Otherwise the best of
    several fuzzywuzzy scores (some scaled by the weights in *config*) is
    compared against ``config.min_ratio``. ``"*"`` matches any non-empty
    text; an empty search term or empty text never matches.
    """
    if not text or not search_term:
        return False

    # Wildcard
    if search_term == "*":
        return True

    search_lower = search_term.lower()
    text_lower = text.lower()

    # Exact (substring) match has the highest priority.
    if search_lower in text_lower:
        return True

    # getattr keeps config objects created before exact_only existed working.
    if getattr(config, 'exact_only', False):
        return False

    # Basic ratio and token-set ratio are always considered, unweighted.
    scores = [
        fuzz.ratio(search_lower, text_lower),
        fuzz.token_set_ratio(search_lower, text_lower),
    ]

    # Partial ratio (good for substring matching)
    if config.enable_partial:
        partial = fuzz.partial_ratio(search_lower, text_lower)
        scores.append(partial * config.partial_ratio_weight)

    # Token sort ratio (good for word order differences)
    if config.enable_token_sort:
        token_sort = fuzz.token_sort_ratio(search_lower, text_lower)
        scores.append(token_sort * config.token_sort_weight)

    return max(scores) >= config.min_ratio


def processCatalog(catalog, b, search_config):
    """Record every post in *catalog* that matches the global search word.

    *catalog* is the parsed catalog JSON: a list of pages, each holding a
    ``'threads'`` list. For each thread the opening post and the entries of
    ``'last_replies'`` are tested with fuzzy_match(); hits are appended to
    the module-level ``results_url`` / ``results_content`` lists, and the
    images of a thread whose opening post matched go to ``results_img``.

    *b* is the board name used to build the post URLs.
    """
    thread_base = "https://boards.4channel.org/" + b + "/thread/"
    for page in catalog:
        for thread in page['threads']:
            url = thread_base + str(thread['no'])

            op_text = thread.get('com')
            if fuzzy_match(wod, op_text, search_config):
                results_url.append(url)
                results_content.append(op_text)
                results_img.extend(pull4chImgs(url))

            # Replies are searched even when the opening post has no text.
            for reply in thread.get('last_replies', []):
                reply_text = reply.get('com')
                if fuzzy_match(wod, reply_text, search_config):
                    results_url.append(url + "#p" + str(reply['no']))
                    results_content.append(reply_text)
                    # images were already retrieved from the OP grab


def repliesSort(catalog, b=None):
    """Return ``(thread_url, reply_count)`` pairs sorted by reply count.

    *b* is the board name used in the URLs; when omitted the module-level
    ``bod`` is used, which is only meaningful for a single-board search.
    """
    board = bod if b is None else b
    thread_base = "https://boards.4channel.org/" + board + "/thread/"
    result = []
    for page in catalog:
        for thread in page['threads']:
            result.append((thread_base + str(thread['no']),
                           thread.get('replies', 0)))
    result.sort(key=lambda item: item[1])
    return result


def print_usage():
    """Print a human-readable usage summary matching parseargs()."""
    print('This program will give you the URLs of all 4chan posts that contain the given search word, either on the entire site or on a select board.')
    print('It supports fuzzy search for typos and partial matches.')
    print('Usage: ./query.py <query> [options]')
    print('Options:')
    print('  -b, --board <board>        Limit the search to one board')
    print('  -z, --fuzzy-ratio <0-100>  Set minimum fuzzy match ratio (default: 60)')
    print('  --exact-only               Disable fuzzy search, use exact matching only')
    print('  --strict                   Use stricter fuzzy matching (ratio: 80)')
    print('  --loose                    Use looser fuzzy matching (ratio: 40)')
    print('  -v, --verbose              Verbose output')
    print('Examples:')
    print('  ./query.py "programming" -b g    # Search for "programming" on /g/')
    print('  ./query.py "programing" -b g     # Will also match "programming"')
    print('  ./query.py "linux" --strict      # Strict matching across all boards')


def parseargs():
    """Parse the command line.

    Sets the module-level ``v``, ``wod`` and ``bod`` and returns
    ``(query, board, config)`` where *config* is a FuzzySearchConfig.
    An explicit ``--fuzzy-ratio`` wins over ``--strict``, which wins over
    ``--loose``.
    """
    global v
    global wod
    global bod
    parser = argparse.ArgumentParser(description='search for current 4chan posts that contain some string')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose')
    parser.add_argument('-z', '--fuzzy-ratio', action='store', type=int, default=None,
                        help='(0-100) minimum fuzzy match ratio (default: 60)')
    parser.add_argument('--strict', action='store_true', default=False,
                        help='use stricter fuzzy matching (ratio: 80)')
    parser.add_argument('--loose', action='store_true', default=False,
                        help='Use looser fuzzy matching (ratio: 40)')
    parser.add_argument('--exact-only', action='store_true', default=False,
                        help='Disable fuzzy search, use exact matching only')
    parser.add_argument('query', help='the search word')
    parser.add_argument('--board', '-b', default='', help="Choose a board to limit your query to")
    args = parser.parse_args()

    config = FuzzySearchConfig()
    if args.loose:
        config.min_ratio = 40
    if args.strict:
        config.min_ratio = 80
    # "is not None" so that an explicit ratio of 0 is honoured.
    if args.fuzzy_ratio is not None:
        if not 0 <= args.fuzzy_ratio <= 100:
            parser.error('--fuzzy-ratio must be between 0 and 100')
        config.min_ratio = args.fuzzy_ratio
    config.exact_only = args.exact_only

    v = args.verbose
    wod = args.query
    bod = args.board
    return args.query, args.board, config


def _fetch_catalog(board):
    """Download and parse one board's catalog; return None on failure."""
    try:
        res = requests.get("https://a.4cdn.org/" + board + "/catalog.json",
                           timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        return res.json()
    except (requests.RequestException, ValueError) as e:
        print(f'Error getting response from API, board /{board}/: {e}')
        return None


def _scan_board(board, search_config):
    """Search one board and return its reply-count list (may be empty)."""
    catalog = _fetch_catalog(board)
    if catalog is None:
        return []
    reply_counts = []
    try:
        reply_counts = repliesSort(catalog, board)
        processCatalog(catalog, board, search_config)
    except Exception as e:
        # Per-board boundary: one malformed catalog or failed image pull
        # must not abort the search of the remaining boards.
        print(f'Error processing board /{board}/: {e}')
    return reply_counts


def _clean_comment(comment):
    """Turn a post's HTML comment into plain text for display."""
    text = _BR_TAG_RE.sub(' ', comment)
    text = _ANY_TAG_RE.sub('', text)
    return html.unescape(text)


#####################################################################
def main():
    """Run the search described by the command line and print the hits."""
    query, board, search_config = parseargs()

    # Start from empty result lists in case main() is called more than once.
    del results_url[:]
    del results_content[:]
    del results_img[:]

    print(f'Fuzzy searching for "{query}" on "{board}" (min ratio: {search_config.min_ratio})')

    if board == '':
        print('searching all boards')
        repl_res = []
        for b in bods:
            print(f'Processing board /{b}/')
            repl_res.extend(_scan_board(b, search_config))
        repl_res.sort(key=lambda item: item[1])
        print(repl_res)
    else:
        print('searching board ' + board)
        reply_counts = _scan_board(board, search_config)
        if reply_counts:
            print(reply_counts)

    print(f'\nFound {len(results_url)} matches:')
    for url in results_url:
        print(url)

    # Optional: show some sample matches with their similarity scores
    if results_content and search_config.min_ratio < 100:
        print(f'\nSample fuzzy matches for "{query}":')
        for content in results_content[:5]:  # Show first 5 matches
            # Strip HTML and limit length for display
            clean_content = _clean_comment(content)
            if len(clean_content) > 100:
                clean_content = clean_content[:100] + "..."
            score = fuzz.partial_ratio(query.lower(), content.lower())
            print(f'[{score}%] {clean_content}')


if __name__ == '__main__':
    main()