5 files changed, 384 insertions, 0 deletions
diff --git a/TODO.txt b/TODO.txt
new file mode 100755
index 0000000..3c2702d
--- /dev/null
+++ b/TODO.txt
@@ -0,0 +1 @@
+make wwwimgpull into a general web crawl tool
diff --git a/pullimgs.py b/pullimgs.py
new file mode 100755
index 0000000..af09c4c
--- /dev/null
+++ b/pullimgs.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python
+
+import json
+import requests
+import sqlite3
+import sys
+from wwwimgpull import *
+#####################################################################
+
+if len(sys.argv) < 2:
+    # No thread URL was given: describe the script and how to invoke it.
+    print('This program prints the URL of every image on a given 4chan thread.')
+    print('Usage: ./pullimgs.py <url>')
+    sys.exit(0)
+
+url = sys.argv[1]  # thread page whose image links are listed, one per line
+for imgurl in pull4chImgs(url):
+    print(imgurl)
diff --git a/query.py b/query.py
new file mode 100755
index 0000000..9871fbd
--- /dev/null
+++ b/query.py
@@ -0,0 +1,209 @@
+#!/usr/bin/python3
+
+import json
+import requests
+import sqlite3
+import sys
+from difflib import SequenceMatcher
+from fuzzywuzzy import fuzz, process
+from wwwimgpull import *
+import argparse
+
+bods = 'a c w m cgl cm f n jp vp v vg vr co g tv k o an tg sp asp sci int out toy biz i po p ck ic wg mu fa 3 gd diy wsg s trv fit x lit adv lgbt mlp b r r9k pol soc s4s'.split()  # board codes scanned when no board is requested
+abods = 'hc hm h e u d y t hr gif'.split()  # second list of board codes; not referenced by the code visible in this file
+
+class FuzzySearchConfig:
+    """Settings read by fuzzy_match: score threshold, weights, algorithm toggles."""
+    def __init__(self):
+        self.min_ratio = 60  # lowest best-score (0-100 scale) that counts as a match
+        self.enable_partial = self.enable_token_sort = True  # both optional scorers on
+        self.partial_ratio_weight = 0.7  # multiplier applied to the partial-ratio score
+        self.token_sort_weight = 0.3  # multiplier applied to the token-sort score
+
+def fuzzy_match(search_term, text, config):
+    """
+    Report whether ``text`` is an acceptable match for ``search_term``.
+
+    Checks run from cheapest to most expensive:
+      1. an empty ``text`` or ``search_term`` never matches;
+      2. the wildcard ``"*"`` matches any non-empty text;
+      3. a case-insensitive substring hit matches outright;
+      4. otherwise several ``fuzz`` scores are computed and the highest
+         one is compared with ``config.min_ratio``.
+
+    ``config`` supplies ``min_ratio``, ``enable_partial``,
+    ``enable_token_sort``, ``partial_ratio_weight`` and
+    ``token_sort_weight``. The partial and token-sort scores are scaled by
+    their weights before the comparison; the plain ratio and the token-set
+    ratio are used unscaled.
+
+    Returns True on a match, False otherwise.
+    """
+    if not (text and search_term):
+        return False
+
+    if search_term == "*":
+        return True
+
+    needle = search_term.lower()
+    haystack = text.lower()
+    if needle in haystack:
+        return True
+
+    # Candidate scores, gathered in the same order the scorers are invoked.
+    candidates = [fuzz.ratio(needle, haystack)]
+
+    if config.enable_partial:
+        weighted = fuzz.partial_ratio(needle, haystack) * config.partial_ratio_weight
+        candidates.append(weighted)
+
+    if config.enable_token_sort:
+        weighted = fuzz.token_sort_ratio(needle, haystack) * config.token_sort_weight
+        candidates.append(weighted)
+
+    candidates.append(fuzz.token_set_ratio(needle, haystack))
+
+    return max(candidates) >= config.min_ratio
+
+def processCatalog(catalog, b, search_config):
+    """Scan one board's catalog and record every post matching the search word.
+
+    catalog: list of pages, each a dict with a 'threads' list; b: board code
+    used to build thread URLs; search_config: passed through to fuzzy_match.
+    Reads ``wod`` and appends to ``results_url``, ``results_content`` and
+    ``results_img``; none are parameters or locals, so they must exist at
+    module level when this runs. Returns None.
+    """
+    for page in catalog:
+        for thread in page['threads']:
+            if 'com' not in thread:  # opening post carries no comment text
+                continue
+            url = "https://boards.4channel.org/"+b+"/thread/"+str(thread['no'])
+            if fuzzy_match(wod, thread['com'], search_config):
+                # Opening-post hits are stored as (url, last_modified) tuples.
+                results_url.append((url, thread['last_modified']))
+                results_content.append(thread['com'])
+                results_img.extend(pull4chImgs(url))
+            for reply in thread.get('last_replies', ()):
+                if 'com' not in reply:
+                    continue
+                # Reply hits are stored as plain URL strings; no image fetch for replies.
+                if fuzzy_match(wod, reply['com'], search_config):
+                    results_url.append(url+"#p"+str(reply['no']))
+                    results_content.append(reply['com'])
+
+#def processThread(thread):
+    
+
+def repliesSort(catalog, board=None):
+    """Return (thread_url, reply_count) pairs for a catalog, fewest replies first.
+
+    board: board code for the URLs; when omitted the module-level ``bod`` is
+    used, as before, which is only right if the catalog belongs to that board.
+    """
+    threads = [t for page in catalog for t in page['threads']]
+    root = ("https://boards.4channel.org/" + (bod if board is None else board)) if threads else ""  # bod is only looked up when needed
+    return sorted(((root + "/thread/" + str(t['no']), t['replies']) for t in threads), key=lambda v: v[1])
+
+def print_usage():
+    """Print a short help text; the option names follow the argparse parser built in parseargs()."""
+    print('This program will give you the URLs of all 4chan posts that contain the given search word, either on the entire site or on a select board.')
+    print('It supports fuzzy search for typos and partial matches.')
+    print('Usage: ./query.py [options] <searchword>')
+    print('Options:')
+    print('  -b, --board <board>          Limit the search to one board (default: all boards)')
+    print('  -z, --fuzzy-ratio <0-100>    Set minimum fuzzy match ratio (default: 60)')
+    print('  --exact-only                 Disable fuzzy search, use exact matching only')
+    print('  --strict                     Use stricter fuzzy matching (ratio: 80)')
+    print('  --loose                      Use looser fuzzy matching (ratio: 40)')
+    print('Examples:')
+    print('  ./query.py "programing" -b g    # also matches "programming" on /g/\n  ./query.py "linux" --strict         # Strict matching across all boards')
+
+def parseargs():
+    """Parse the command line and return (query, board, FuzzySearchConfig).
+
+    Threshold precedence: --exact-only, then --fuzzy-ratio, --strict, --loose."""
+    parser = argparse.ArgumentParser(description='search for current 4chan posts that contain some string')
+    parser.add_argument('-v', '--verbose', action='store_true', help='verbose')
+    parser.add_argument('-z', '--fuzzy-ratio', type=int, choices=range(0, 101), metavar='0-100', help='(0-100) minimum fuzzy match ratio (default: 60)')
+    parser.add_argument('--strict', action='store_true', default = False, help='use stricter fuzzy matching (ratio: 80)')
+    parser.add_argument('--loose', action='store_true', default = False, help='Use looser fuzzy matching (ratio: 40)')
+    parser.add_argument('--exact-only', action='store_true', default = False, help='Disable fuzzy search, use exact matching only')
+    parser.add_argument('query', help='the search word')
+    parser.add_argument('--board', '-b', default = '', help="Choose a board to limit your query to")
+    args = parser.parse_args()
+    config = FuzzySearchConfig()
+    if args.exact_only:
+        config.min_ratio = 101  # above the 0-100 score scale, so only substring hits can match
+    elif args.fuzzy_ratio is not None:  # 0 is a legitimate explicit value; stored as int, not str
+        config.min_ratio = args.fuzzy_ratio
+    elif args.strict or args.loose:
+        config.min_ratio = 80 if args.strict else 40
+    return args.query, args.board, config
+
+
+#####################################################################
+
+def main():
+    """
+    Search 4chan catalogs for the query and print what was found.
+
+    Flow: parse arguments, fetch the catalog of the requested board (or of every
+    board in ``bods``), print the threads ranked by reply count, collect matches
+    through processCatalog(), then print the hits and up to five scored samples.
+    """
+    # processCatalog() and repliesSort() read these names from module scope,
+    # so they must be module globals; as locals of main() they were invisible.
+    global wod, bod, results_url, results_content, results_img
+    wod, bod, search_config = parseargs()
+
+    results_url = [] #URLs of threads containing the keyword
+    results_content = [] #text of all posts and comments containing the keyword
+    results_img = [] #URLs of all images containing the keyword
+
+    print(f'Fuzzy searching for "{wod}" on  "{bod}" (min ratio: {search_config.min_ratio})')
+
+    if bod == '':
+        print('searching all boards')
+        boards = bods
+    else:
+        print('searching board ' + bod)
+        boards = [bod]
+
+    repl_res = []  # (thread URL, reply count) pairs from every processed board
+    for b in boards:
+        print(f'Processing board /{b}/')
+        # repliesSort() builds URLs from the global ``bod`` when called with
+        # the catalog alone, so point it at the board being processed.
+        bod = b
+        try:
+            res = requests.get("https://a.4cdn.org/"+b+"/catalog.json")
+            if not res.ok:
+                print(f'Error getting response from API, board /{b}/.')
+                continue
+            catalog = json.loads(res.text)  # decode once, reuse for both helpers
+            repl_res.extend(repliesSort(catalog))
+            processCatalog(catalog, b, search_config)
+            # TODO: per-thread processing is not implemented; see https://github.com/seanpm2001/4Chan_4Chan-API/blob/master/pages/Endpoints_and_domains.md
+        except Exception as e:  # best effort: report and move on to the next board
+            print(f'Error processing board /{b}/: {e}')
+    repl_res.sort(key=lambda v: v[1])
+    print(repl_res)
+
+    print(f'\nFound {len(results_url)} matches:')
+    for url in results_url:
+        print(url)
+
+    # Optional: Show some sample matches with their similarity scores
+    if results_content and search_config.min_ratio < 100:
+        print(f'\nSample fuzzy matches for "{wod}":')
+        for content in results_content[:5]:  # Show first 5 matches
+            # Strip HTML and limit length for display
+            clean_content = content.replace('<br>', ' ').replace('&gt;', '>').replace('&lt;', '<')
+            if len(clean_content) > 100:
+                clean_content = clean_content[:100] + "..."
+            score = fuzz.partial_ratio(wod.lower(), content.lower())
+            print(f'[{score}%] {clean_content}')
+
+if __name__ == '__main__':
+    main()
diff --git a/search4c_oldhb.py b/search4c_oldhb.py
new file mode 100755
index 0000000..bed39b4
--- /dev/null
+++ b/search4c_oldhb.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python3
+
+import json
+import requests
+import sqlite3
+import sys
+from wwwimgpull import *
+
+
+def processCatalog(catalog, b):
+    """Record every post on board ``b`` whose comment contains the search word.
+
+    Uses the module-level ``wod`` (search word; "*" or "" matches every post)
+    and appends to the module-level results_url / results_content / results_img.
+    """
+    match_all = wod in ("*", "")
+    needle = wod.lower()
+    for page in catalog:  # each page of the board
+        for thread in page['threads']:  # each OP on the page
+            if 'com' not in thread:
+                continue
+            url = "https://boards.4channel.org/"+b+"/thread/"+str(thread['no'])
+            if match_all or needle in thread['com'].lower():
+                results_url.append(url)
+                results_content.append(thread['com'])
+                results_img.extend(pull4chImgs(url))
+            for reply in thread.get('last_replies', ()):  # each comment on the OP
+                # Apply the same wildcard rule as for the OP so "*" matches replies too.
+                if 'com' in reply and (match_all or needle in reply['com'].lower()):
+                    results_url.append(url+"#p"+str(reply['no']))
+                    results_content.append(reply['com'])
+
+bods = 'a c w m cgl cm f n jp vp v vg vr co g tv k o an tg sp asp sci int out toy biz i po p ck ic wg mu fa 3 gd diy wsg s trv fit x lit adv lgbt mlp b r r9k pol soc s4s'.split()  # board codes scanned when no board is requested
+abods = 'hc hm h e u d y t hr gif'.split()  # second list of board codes; not referenced by the code visible in this file
+
+
+#####################################################################
+
+
+# startswith() rather than [0]: an empty search word ("" is a wildcard) must not raise IndexError.
+if len(sys.argv) < 2 or sys.argv[1].startswith('-'):
+    print('This program will give you the URLs of all 4chan posts that contain the given search word, either on the entire site or on a select board. You can also use wildcard as search word for all posts of board/site')
+    print('you must provide a search word, or \"*\" for any word.')
+    print('Usage: ./query.py <searchword> [board]')
+    sys.exit(0)
+
+wod = sys.argv[1] #the search keyword
+bod = sys.argv[2] if len(sys.argv) > 2 else '' #the board of interest, if given
+
+results_url = [] #URLs of threads containing the keyword
+results_content = [] #text of all posts and comments containing the keyword
+results_img = [] #URLs of all images containing the keyword
+
+# Scan the requested board, or every board in bods when none was given.
+for b in ([bod] if bod != '' else bods):
+    res = requests.get("https://a.4cdn.org/"+b+"/catalog.json")
+    if not res.ok:
+        # e.g. an unknown board: report it and continue instead of failing inside json.loads
+        print('could not fetch catalog for /'+b+'/ (HTTP '+str(res.status_code)+')', file=sys.stderr)
+        continue
+    processCatalog(json.loads(res.text), b)
+
+
+for url in results_url:
+    print(url)
diff --git a/wwwimgpull.py b/wwwimgpull.py
new file mode 100755
index 0000000..6f341c5
--- /dev/null
+++ b/wwwimgpull.py
@@ -0,0 +1,91 @@
+#!/bin/python
+
+import requests
+from bs4 import BeautifulSoup
+import re
+import sys
+import time
+
+###
+# get images from websites
+#    will probably be updated to include other data
+###
+
+def pull4chImgs(url):
+    """Return the href of every <a class="fileThumb"> link on the page at url.
+
+    Protocol-relative hrefs ("//host/...") get an "https:" prefix. Anchors
+    without an href are skipped rather than raising TypeError on the slice.
+    """
+    page = BeautifulSoup(requests.get(url).text, 'html.parser')
+    result = []
+    for a in page.find_all('a', href=True):  # href=True: only anchors that have one
+        if 'fileThumb' in a.get('class', []):
+            result.append(('https:' if a['href'].startswith('//') else '') + a['href'])
+    return result
+
+def pullVids(url):
+    """Return the href of every link on the page at url that contains '.webm'.
+
+    Anchors without an href are skipped (a None href would break the 'in' test).
+    """
+    page = BeautifulSoup(requests.get(url).text, 'html.parser')
+    # href=True restricts the search to anchors that carry an href attribute.
+    return [a['href'] for a in page.find_all('a', href=True) if '.webm' in a['href']]
+
+def pullImgs(url):
+    """Fetch the page at url and return the src value of each <img> element.
+
+    Whatever img.get('src') yields is kept as-is, including a missing src.
+    """
+    markup = requests.get(url).text
+    soup = BeautifulSoup(markup, 'html.parser')
+    return [tag.get('src') for tag in soup.find_all('img')]
+
+#def pullPDFs(url):
+#    return pullPDFs(url, 0)
+
+def pullPDFs(url, depth=0, alreadycrawled=None):
+    """Crawl same-host links starting at url; return the links containing '.pdf'.
+
+    Recursion stops once depth exceeds 5; alreadycrawled lists URLs to skip and is extended in place."""
+    from urllib.parse import urldefrag, urljoin, urlsplit  # function-scope import; file imports untouched
+    alreadycrawled = [] if alreadycrawled is None else alreadycrawled  # no list shared between calls
+    if depth > 5 or not url or url in alreadycrawled:
+        return []
+    print(url)
+    html = BeautifulSoup(requests.get(url).text, 'html.parser')
+    alreadycrawled.append(url)
+    result = []
+    for a in html.find_all('a', href=True):
+        link = urldefrag(urljoin(url, a['href'])).url  # absolute URL without its #fragment
+        offsite = not link.startswith('http') or urlsplit(link).netloc != urlsplit(url).netloc
+        if a['href'] in ('', '/') or offsite:  # also drops mailto:/javascript: links
+            continue
+        print('found ' + link)
+        if '.pdf' in link:  # no os.path.isfile(): a remote URL is never a local file
+            result.append(link)
+        elif link not in alreadycrawled:
+            time.sleep(5)  # pause before the next request
+            result += pullPDFs(link, depth+1, alreadycrawled)
+    return result
+
+def getLinksContainingStr(url, s):
+    """Return the href of every <a> on the page at url whose href contains s.
+
+    url: page to fetch; s: substring looked for in each href (case-sensitive).
+    Anchors that have no href are skipped, since ``s in None`` would raise.
+    """
+    page = BeautifulSoup(requests.get(url).text, 'html.parser')
+    hrefs = (link['href'] for link in page.find_all('a', href=True))
+    # href=True above makes BeautifulSoup return only anchors with an href.
+    return [h for h in hrefs if s in h]
+
+
+
+#if len(sys.argv) < 2:
+#    sys.exit(0)
+
+#for url in pull4chImgs(sys.argv[1]):
+#    print(url)
+