summaryrefslogtreecommitdiff
path: root/query.py
blob: 9871fbd21b5dba91b74ffbbeb376db23ebdc142f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/python3

import json
import requests
import sqlite3
import sys
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz, process
from wwwimgpull import *
import argparse

# Board names that main() iterates when no specific board is requested; each
# one is fetched from https://a.4cdn.org/<board>/catalog.json.
bods = ['a', 'c', 'w', 'm', 'cgl', 'cm', 'f', 'n', 'jp', 'vp', 'v', 'vg', 'vr', 'co', 'g', 'tv', 'k', 'o', 'an', 'tg', 'sp', 'asp', 'sci', 'int', 'out', 'toy', 'biz', 'i', 'po', 'p', 'ck', 'ic', 'wg', 'mu', 'fa', '3', 'gd', 'diy', 'wsg', 's', 'trv', 'fit', 'x', 'lit', 'adv', 'lgbt', 'mlp', 'b', 'r', 'r9k', 'pol', 'soc', 's4s']
# A second list of board names that the visible code never reads.
# NOTE(review): presumably boards deliberately left out of the site-wide
# search -- confirm before wiring it in.
abods = ['hc', 'hm', 'h', 'e', 'u', 'd', 'y', 't', 'hr', 'gif']

class FuzzySearchConfig:
    """Tunable settings read by fuzzy_match().

    Every setting is a keyword-only constructor argument whose default equals
    the value that used to be hard-coded, so ``FuzzySearchConfig()`` behaves
    exactly as before while callers may now override individual settings.

    Attributes:
        min_ratio: Minimum score (0-100) a post must reach to count as a match.
        partial_ratio_weight: Multiplier applied to the partial-ratio score.
        token_sort_weight: Multiplier applied to the token-sort score.
        enable_partial: Whether the partial-ratio score is computed at all.
        enable_token_sort: Whether the token-sort score is computed at all.
    """

    def __init__(self, *, min_ratio=60, partial_ratio_weight=0.7,
                 token_sort_weight=0.3, enable_partial=True,
                 enable_token_sort=True):
        self.min_ratio = min_ratio  # Minimum similarity ratio (0-100)
        self.partial_ratio_weight = partial_ratio_weight
        self.token_sort_weight = token_sort_weight
        self.enable_partial = enable_partial
        self.enable_token_sort = enable_token_sort

    def __repr__(self):
        return (
            f'{type(self).__name__}('
            f'min_ratio={self.min_ratio!r}, '
            f'partial_ratio_weight={self.partial_ratio_weight!r}, '
            f'token_sort_weight={self.token_sort_weight!r}, '
            f'enable_partial={self.enable_partial!r}, '
            f'enable_token_sort={self.enable_token_sort!r})'
        )

def fuzzy_match(search_term, text, config):
    """Decide whether ``text`` matches ``search_term``.

    The checks run in this order:

    1. Empty or missing ``text`` never matches.
    2. The wildcard terms ``"*"`` and ``""`` match any non-empty text.
    3. A case-insensitive substring hit matches immediately.
    4. Otherwise several fuzzywuzzy scores are computed and the best one is
       compared against ``config.min_ratio``.

    Args:
        search_term: The word or phrase being searched for.
        text: The post text to test.
        config: Object providing ``min_ratio``, ``enable_partial``,
            ``partial_ratio_weight``, ``enable_token_sort`` and
            ``token_sort_weight``. It is only read when step 4 is reached.

    Returns:
        True if a match is found, False otherwise.
    """
    if not text:
        return False

    # Wildcard handling has to come before the "no search term" guard below;
    # otherwise the empty-string wildcard can never be reached.
    if search_term in ("*", ""):
        return True

    if not search_term:
        return False

    search_lower = search_term.lower()
    text_lower = text.lower()

    # Exact (substring) match has the highest priority and skips scoring.
    if search_lower in text_lower:
        return True

    # The basic ratio and the token-set ratio are always computed and count
    # unweighted; the two optional scores are scaled by their config weights.
    scores = [
        fuzz.ratio(search_lower, text_lower),
        fuzz.token_set_ratio(search_lower, text_lower),
    ]

    # Partial ratio (good for substring matching)
    if config.enable_partial:
        partial_ratio = fuzz.partial_ratio(search_lower, text_lower)
        scores.append(partial_ratio * config.partial_ratio_weight)

    # Token sort ratio (good for word order differences)
    if config.enable_token_sort:
        token_sort_ratio = fuzz.token_sort_ratio(search_lower, text_lower)
        scores.append(token_sort_ratio * config.token_sort_weight)

    return max(scores) >= config.min_ratio

def processCatalog(catalog, b, search_config):
    """Record every OP and preview reply in a board catalog that matches.

    The search word is read from the module-level name ``wod``. Matches are
    appended to the module-level lists ``results_url``, ``results_content``
    and ``results_img``; nothing is returned.

    OP matches are stored in ``results_url`` as ``(url, last_modified)``
    tuples, reply matches as plain ``url#p<no>`` strings.

    Args:
        catalog: Parsed catalog JSON: a list of pages, each a dict with a
            ``'threads'`` list.
        b: Board name used to build the thread URLs.
        search_config: Settings object passed through to fuzzy_match().
    """
    for page in catalog:  # each page of the board
        for thread in page['threads']:  # each OP on the page
            url = "https://boards.4channel.org/" + b + "/thread/" + str(thread['no'])

            # A thread dict may have no 'com' key. Skip only the OP check in
            # that case, so the replies below are still searched.
            if 'com' in thread and fuzzy_match(wod, thread['com'], search_config):
                results_url.append((url, thread['last_modified']))
                results_content.append(thread['com'])
                # pull4chImgs is not defined in this file; presumably it comes
                # from the wwwimgpull star-import.
                results_img.extend(pull4chImgs(url))

            for reply in thread.get('last_replies', []):  # each comment on the OP
                if 'com' not in reply:
                    continue
                if fuzzy_match(wod, reply['com'], search_config):
                    results_url.append(url + "#p" + str(reply['no']))
                    results_content.append(reply['com'])
                    # Images are only pulled when the OP itself matched.

# TODO: processThread(thread) -- per-thread processing is not implemented yet.
    

def repliesSort(catalog, board=None):
    """Return ``(thread URL, reply count)`` pairs, fewest replies first.

    Args:
        catalog: Parsed catalog JSON: a list of pages, each a dict with a
            ``'threads'`` list whose entries have ``'no'`` and ``'replies'``.
        board: Board name used to build the thread URLs. When omitted, the
            module-level ``bod`` is used, which was the only behaviour before
            this parameter existed.

    Returns:
        A list of ``(url, replies)`` tuples sorted ascending by reply count.
        Threads with equal counts keep their catalog order.
    """
    result = []
    for page in catalog:
        for thread in page['threads']:
            # Resolve the fallback lazily so a catalog with no threads never
            # touches the global, exactly as before.
            board_name = bod if board is None else board
            url = "https://boards.4channel.org/" + board_name + "/thread/" + str(thread['no'])
            result.append((url, thread['replies']))

    result.sort(key=lambda pair: pair[1])
    return result

def print_usage():
    """Print a short usage summary to stdout.

    The text lists the options that parseargs() registers with argparse. The
    board is chosen with ``-b/--board``; it is not a positional argument, and
    there is no ``--fuzzy`` flag.
    """
    print('This program will give you the URLs of all 4chan posts that contain the given search word, either on the entire site or on a select board.')
    print('It supports fuzzy search for typos and partial matches.')
    print('Usage: ./query.py [options] <searchword>')
    print('Options:')
    print('  -b, --board <board>          Limit the search to one board (default: all boards)')
    print('  -z, --fuzzy-ratio <0-100>    Set minimum fuzzy match ratio (default: 60)')
    print('  --exact-only                 Disable fuzzy search, use exact matching only')
    print('  --strict                     Use stricter fuzzy matching (ratio: 80)')
    print('  --loose                      Use looser fuzzy matching (ratio: 40)')
    print('  -v, --verbose                Verbose')
    print('Examples:')
    print('  ./query.py "programming" -b g       # Search for "programming" on /g/')
    print('  ./query.py "programing" -b g        # May also match "programming"')
    print('  ./query.py "linux" --strict         # Strict matching across all boards')

def parseargs():
    """Parse the command line and build the fuzzy-search configuration.

    The minimum ratio is chosen by the first rule that applies:

    1. ``--exact-only``: fuzzy scoring is disabled.
    2. ``-z/--fuzzy-ratio N``: the explicit ratio is used.
    3. ``--strict``: ratio 80.
    4. ``--loose``: ratio 40.
    5. Otherwise the FuzzySearchConfig default is kept.

    Returns:
        A ``(query, board, config)`` tuple: the search word, the board name
        (``''`` when none was given) and a FuzzySearchConfig instance.

    Exits through ``parser.error`` when ``--fuzzy-ratio`` is not an integer
    between 0 and 100.
    """
    parser = argparse.ArgumentParser(description='search for current 4chan posts that contain some string')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose')
    # type=int: the ratio is compared numerically against fuzz scores, so it
    # must not be left as the raw command-line string.
    parser.add_argument('-z', '--fuzzy-ratio', type=int, default=None, help='(0-100) minimum fuzzy match ratio (default: 60)')
    parser.add_argument('--strict', action='store_true', default=False, help='use stricter fuzzy matching (ratio: 80)')
    parser.add_argument('--loose', action='store_true', default=False, help='Use looser fuzzy matching (ratio: 40)')
    parser.add_argument('--exact-only', action='store_true', default=False, help='Disable fuzzy search, use exact matching only')
    parser.add_argument('query', help='the search word')
    parser.add_argument('--board', '-b', default='', help="Choose a board to limit your query to")
    args = parser.parse_args()

    if args.fuzzy_ratio is not None and not 0 <= args.fuzzy_ratio <= 100:
        parser.error('--fuzzy-ratio must be between 0 and 100')

    config = FuzzySearchConfig()

    if args.exact_only:
        # The help text gives ratios on a 0-100 scale, so a threshold above
        # 100 cannot be reached by a fuzzy score; only the substring check in
        # fuzzy_match() can succeed.
        config.min_ratio = 101
    elif args.fuzzy_ratio is not None:  # "is not None" so that 0 is honoured
        config.min_ratio = args.fuzzy_ratio
    elif args.strict:
        config.min_ratio = 80
    elif args.loose:
        config.min_ratio = 40

    return args.query, args.board, config


#####################################################################

def main():
    """Search the current 4chan catalogs for the query from the command line.

    Steps:
        1. Parse the arguments with parseargs().
        2. Download the catalog of the requested board, or of every board in
           ``bods`` when no board was given.
        3. For each catalog, collect the reply-count ranking from
           repliesSort() and the matches from processCatalog().
        4. Print the ranking, the matching URLs and up to five sample matches
           with their partial-ratio score.

    A failure on one board is reported and the remaining boards are still
    processed.
    """
    # processCatalog() and repliesSort() read the search word, the current
    # board and the result lists as module-level names, so they have to be
    # published as globals instead of staying local to main().
    global wod, bod, results_url, results_content, results_img

    # Parse command line arguments
    wod, requested_board, search_config = parseargs()

    results_url = []  # URLs of threads containing the keyword
    results_content = []  # text of all posts and comments containing the keyword
    results_img = []  # image URLs pulled from threads whose OP matched

    print(f'Fuzzy searching for "{wod}" on  "{requested_board}" (min ratio: {search_config.min_ratio})')

    search_all = requested_board == ''
    if search_all:
        print('searching all boards')
        boards = bods
    else:
        print('searching board ' + requested_board)
        boards = [requested_board]

    repl_res = []  # (thread URL, reply count) pairs across all processed boards
    for board in boards:
        bod = board  # repliesSort() builds its URLs from the module-level bod
        if search_all:
            print(f'Processing board /{board}/')
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            res = requests.get("https://a.4cdn.org/" + board + "/catalog.json", timeout=30)
            if not res or res.text is None:
                print(f'Error getting response from API, board /{board}/.')
                continue

            catalog = json.loads(res.text)  # each page has page # and threads array
            # TODO: per-thread processing, see
            # https://github.com/seanpm2001/4Chan_4Chan-API/blob/master/pages/Endpoints_and_domains.md

            # extend, not append: keep one flat list so the sort below orders
            # individual threads by reply count.
            repl_res.extend(repliesSort(catalog))
            processCatalog(catalog, board, search_config)
        except Exception as e:
            # Best effort: report the failing board and carry on with the rest.
            print(f'Error processing board /{board}/: {e}')

    repl_res.sort(key=lambda v: v[1])
    print(repl_res)

    print(f'\nFound {len(results_url)} matches:')
    for url in results_url:
        print(url)

    # Optional: Show some sample matches with their similarity scores
    if results_content and search_config.min_ratio < 100:
        print(f'\nSample fuzzy matches for "{wod}":')
        for content in results_content[:5]:  # Show first 5 matches
            # Replace a few HTML fragments and limit length for display
            clean_content = content.replace('<br>', ' ').replace('&gt;', '>').replace('&lt;', '<')
            if len(clean_content) > 100:
                clean_content = clean_content[:100] + "..."

            score = fuzz.partial_ratio(wod.lower(), content.lower())
            print(f'[{score}%] {clean_content}')

# Run the search only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()