#!/usr/bin/python3
import argparse
import os
import random
import re
import subprocess
import sys
import time

import chardet
import filetype

# Words Of Wisdom
# Output some random text from a given collection of files.
#  - primarily used to grab some random "words of wisdom" from my journals and writings
#
paths = []  # the paths to scan recursively for files from which to grab text
#samplefiles=[] #the paths of all the individual files from which we can grab text
matchpattern = ''  # if we want to filter the files by some text pattern that the filename must match
time_min = -1  # threshold time. don't use files that are older
v = False  # verbose flag; switched on by parseArgs() when -v is given

# Default regex patterns to exclude. Raw strings with the dots escaped, so that
# '.git' does not also match names such as 'digits.txt'.
exclude_patterns_default = [r'\.git', 'node_modules', 'vendor', r'\.~lock']

# File types (as reported by filetype.guess) that are treated as source code.
CODE_EXTENSIONS = ('py', 'c', 'cc', 'h', 'hh', 'java', 'rst', 'css', 'html',
                   'htm', 'js', 'php', 'sh')


def _log(message):
    """Print *message* only when verbose mode is on."""
    if v:
        print(message)


def attemptReadSampleFile(filepath):
    """Try to read *filepath* as usable sample text.

    Args:
        filepath: path of the candidate file.

    Returns:
        The text content as a string, or None when the path is not a regular
        file, looks like source code, is of an unsupported type, is not
        ascii/utf-8 encoded, or cannot be read.
    """
    _log('checking {}'.format(filepath))
    if not os.path.isfile(filepath):
        _log('not a file')
        return None

    try:
        ftype = filetype.guess(filepath)
        _log('filetype: {}'.format(str(ftype)))

        if ftype is None:
            # No type was recognised: fall back to encoding detection and
            # accept the file only if it decodes as ascii or utf-8.
            with open(filepath, 'rb') as f:
                fb = f.read()  # file data (bytes) to detect encoding
            enc = str(chardet.detect(fb)['encoding'])
            _log('encoding: {}'.format(enc))
            if enc in ('ascii', 'utf-8'):
                return str(fb, encoding=enc)
            return None

        if ftype.extension in CODE_EXTENSIONS:
            # don't want code in the sample data
            _log('this file is code')
            return None

        if ftype.extension == 'odt' and filepath[-1] != '#':
            # openoffice doc and not a lock file
            subproc = subprocess.run(['odt2txt', filepath], encoding='utf-8',
                                     stdout=subprocess.PIPE)
            if subproc.returncode != 0:
                # A failed conversion must not be mistaken for valid text.
                _log('odt2txt exited with status {}'.format(subproc.returncode))
                return None
            return subproc.stdout

        if ftype.extension == 'txt':
            with open(filepath, 'rb') as f:
                return str(f.read(), encoding='utf-8')

        # Any other recognised type is not usable as sample text.
        return None
    except Exception as e:
        # Deliberately best-effort: an unreadable file is skipped, not fatal.
        # Reported on stderr so it does not mix with the excerpt on stdout.
        print(f"Error reading file {filepath}: {e}", file=sys.stderr)
        return None


def getSampleFiles(paths, exclude_patterns=None):
    """Grab all the possible sample files (recursive files from given paths).

    Args:
        paths: iterable of directories (walked recursively) and/or files.
        exclude_patterns: optional list of regex patterns; directory and file
            names matching any of them (case-insensitively) are skipped.
            None is accepted and means "no extra patterns". The patterns in
            exclude_patterns_default are always applied as well.

    Returns:
        List of candidate file paths.
    """
    # `exclude_patterns` is None when the caller has no -x options to pass.
    all_patterns = list(exclude_patterns or []) + exclude_patterns_default
    # Join patterns with the OR operator and compile once.
    compiled_exclusions = re.compile(
        '|'.join('(?:{})'.format(pat) for pat in all_patterns), re.IGNORECASE)

    samples = []
    tStart = time.perf_counter()
    for p in paths:
        _log('path {}'.format(p))
        if os.path.isdir(p):
            for root, dirs, files in os.walk(p):
                # Prune in place so os.walk does not descend into excluded dirs.
                dirs[:] = [d for d in dirs if not compiled_exclusions.search(d)]
                _log('walk {}: {} files, {} dirs'.format(root, len(files), len(dirs)))
                for f in files:
                    filepath = os.path.join(root, f)
                    if compiled_exclusions.search(f):
                        _log(f'excluding file: {filepath}')
                        continue
                    samples.append(filepath)
        elif compiled_exclusions.search(os.path.basename(p)):
            # For individual files, still check exclusion.
            _log(f'excluding file: {p}')
        else:
            samples.append(p)
    tDuration = time.perf_counter() - tStart
    _log('gathered {} candidate files in {} seconds, from paths {}'.format(
        len(samples), tDuration, str(paths)))
    return samples


def getRandomFileTextContent(samplefiles):
    """Pick random files until one yields acceptable text.

    Args:
        samplefiles: list of candidate file paths; it is not modified.

    Returns:
        Tuple (filename, textcontent), or (None, None) when the list is empty
        or no file in it can be read.
    """
    # Walking a shuffled copy picks uniformly among the readable files, visits
    # each candidate at most once, and cannot index past the end of the list.
    candidates = list(samplefiles)
    random.shuffle(candidates)
    for candidate in candidates:
        _log('candidate file: {}'.format(candidate))
        text = attemptReadSampleFile(candidate)  # None means invalid file
        if text is None:
            continue
        if v:
            mt = time.ctime(os.path.getmtime(candidate))
            print('{} ;\n last modified {} :\n '.format(candidate, mt))
        return (candidate, text)
    return (None, None)  # no files are valid


# TODO params
def getExcerpt(text):
    """Grab a random excerpt from the given string.

    The text is assumed to be at least one paragraph long, with multiple
    lines. A window of 2*offset lines is taken around a random line index;
    the whole text is returned when it is too short for the window or when
    the window comes out shorter than 7 characters (which includes the
    offset == 0 case, where the window is empty).

    Args:
        text: the full text to sample from.

    Returns:
        The excerpt as a string ('' for empty or None input).
    """
    if not text:
        return ''
    offset = random.randint(0, 5)
    lines = text.splitlines()
    if offset >= len(lines) / 2:
        return text
    li = random.randint(offset, len(lines) - offset)  # line index
    res = '\n'.join(lines[li - offset: li + offset])
    if len(res) < 7:
        res = '\n'.join(lines)
    return res


def parseArgs(argv=None):
    """Parse the command line and set the global verbose flag.

    Args:
        argv: optional list of argument strings; None means sys.argv[1:].

    Returns:
        The argparse namespace with `verbose`, `paths` (never empty; defaults
        to ['./']), `output` and `exclude` (None when -x was not given).
    """
    global v
    parser = argparse.ArgumentParser(
        description='output some random text from some given collection of files')
    parser.add_argument('-v', '--verbose', action='store_true', help='verbose')
    parser.add_argument('-p', '--paths', type=str, required=False,
                        help='a path to scan', action='append', default=None)
    # NOTE: --output is parsed but main() does not use it.
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='output file')
    parser.add_argument('-x', '--exclude', type=str, required=False,
                        help='exclude files with pattern', action='append')
    args = parser.parse_args(argv)

    if args.verbose:
        v = True
    if not args.paths:
        args.paths = ['./']
    _log('using paths: {}'.format(args.paths))
    if args.exclude:
        _log('excluding: {}'.format(str(args.exclude)))
    return args


def main(argv=None):
    """Print a random excerpt followed by the file it came from.

    Args:
        argv: optional list of argument strings; None means sys.argv[1:].

    Returns:
        Process exit status: 0 on success, 1 when no readable file was found.
    """
    args = parseArgs(argv)
    samplefiles = getSampleFiles(args.paths, args.exclude)
    filename, text = getRandomFileTextContent(samplefiles)
    if text is None:
        print('no readable text files found in {}'.format(args.paths),
              file=sys.stderr)
        return 1
    print(getExcerpt(text))
    print('\n - {}'.format(filename))
    return 0


if __name__ == '__main__':
    sys.exit(main())