diff options
| author | thomas grothe <grothe.tr@gmail.com> | 2023-11-30 12:04:59 -0600 |
|---|---|---|
| committer | thomas grothe <grothe.tr@gmail.com> | 2023-11-30 12:04:59 -0600 |
| commit | c88bf4ed44f3366275034360101d64a4c060b410 (patch) | |
| tree | b7c9f0025bca5282b1de737ea1107336d0b6a302 | |
| parent | 887489c283ba2bbf0538e7eb7c4531ca7c07a769 (diff) | |
Made a more efficient version that doesn't read every file unnecessarily. It currently does not detect the filetype of rst files, though, so those are treated as acceptable files.
| -rwxr-xr-x | wow.py | 38 | ||||
| -rwxr-xr-x | wow2.py | 79 |
2 files changed, 105 insertions, 12 deletions
@@ -8,6 +8,7 @@ import time import filetype #for filetype (extension and mime type) import chardet #for getting encoding import subprocess +import random total_text = '' @@ -16,7 +17,7 @@ def addSampleFileIfTxt(filepath): if v: print('checking {}'.format(filepath)) if os.path.isfile(filepath): f = open(filepath, 'rb') - fb = f.read() #file data (bytes) + fb = f.read() #file data (bytes). unfortunately have to read the file to detect encoding enc = str(chardet.detect(fb)['encoding']) ftype = filetype.guess(filepath) if v: print('filetype: {}'.format(str(ftype))) @@ -24,14 +25,15 @@ def addSampleFileIfTxt(filepath): if ftype != None: if ftype.extension in ['py', 'c', 'cc', 'h', 'hh', 'java']: #don't want code in the sample data return - if ftype.extension == 'odt': - if v: print('converting odt: {}'.format(filepath)) - subproc = subprocess.run(['odt2txt', filepath], encoding='utf-8', stdout=subprocess.PIPE) - total_text += str(subproc.stdout) + if ftype.extension == 'odt': + samplefiles.append(filepath) + #if v: print('converting odt: {}'.format(filepath)) + #subproc = subprocess.run(['odt2txt', filepath], encoding='utf-8', stdout=subprocess.PIPE) + #total_text += str(subproc.stdout) if enc in ['ascii','utf-8']: if v: print(fb) - total_text += str(fb, encoding='utf-8') - + samplefiles.append(filepath) + #total_text += str(fb, encoding='utf-8') else: if v: print('{} is not txt') else: @@ -40,7 +42,7 @@ def addSampleFileIfTxt(filepath): #paths=['/home/thomas/doc/fiction'] paths=['/home/thomas/doc/j/', '/home/thomas/_poetry', '/home/thomas/doc/_journal_2019'] #the paths to scan recursively for files from which to grab text -samplefiles=[] #the individual files we want to grab text from +samplefiles=[] #the paths of the individual files from which we want to grab text v=False #verbose tStart = time.time() @@ -54,12 +56,24 @@ for p in paths: else: addSampleFileIfTxt(p) -#now we have all of our files of interest. 
so pick a random one - tEnd = time.time() tDuration = tEnd - tStart -print('report generated in {} seconds, from paths {}'.format(tDuration, str(paths))) -print(total_text) +print('gathered {} acceptable files in {} seconds, from paths {}'.format(len(samplefiles), tDuration, str(paths))) + +#now we have all of our files of interest. so pick a random one +fi = random.randint(0, len(samplefiles)) # file index +fc = samplefiles[fi] # the chosen one +print('chose file ' + fc) +f = open(fc, 'rb') +fb = f.read() +ftype = filetype.guess(fc) +if ftype != None and ftype.extension == 'odt': + subproc = subprocess.run(['odt2txt', fc], encoding='utf-8', stdout=subprocess.PIPE) + print(subproc.stdout) +else: + print(str(f.read(), encoding='utf-8')) + + #for f in samplefiles: # print(f) @@ -0,0 +1,79 @@ +#!/usr/bin/python +#Words Of Wisdom +# output some random text from my journals and writings + +import os +import sys +import time +import filetype #for filetype (extension and mime type) +import chardet #for getting encoding +import subprocess +import random + +total_text = '' + +#can we parse text out of this file or is it a libreoffice file? 
if so, return the contents +def attemptReadSampleFile(filepath): + if v: print('checking {}'.format(filepath)) + if os.path.isfile(filepath): + f = open(filepath, 'rb') + ftype = filetype.guess(filepath) + if v: print('filetype: {}'.format(str(ftype))) + if ftype != None: + if ftype.extension in ['py', 'c', 'cc', 'h', 'hh', 'java', 'rst', 'css', 'html', 'htm', 'js', 'php']: #don't want code in the sample data + if v: print('this file is code') + return None + if ftype.extension == 'odt': + subproc = subprocess.run(['odt2txt', filepath], encoding='utf-8', stdout=subprocess.PIPE) + return subproc.stdout + else: + fb = f.read() #file data (bytes) to detect encoding + enc = str(chardet.detect(fb)['encoding']) + if v: print('encoding: {}'.format(enc)) + if enc in ['ascii','utf-8']: + return str(fb, encoding=enc) + else: + return None + else: + if v: print('not file') + return None + + +#paths=['/home/thomas/doc/fiction'] +paths=['/home/thomas/doc/j/', '/home/thomas/_poetry', '/home/thomas/doc/_journal_2019'] #the paths to scan recursively for files from which to grab text +samplefiles=[] #the paths of the individual files from which we want to grab text + +v=True #verbose +tStart = time.time() +for p in paths: + if v: print('path {}'.format(p)) + if os.path.isdir(p): + for root,dirs,files in os.walk(p): + if v: print('walk {}: {} files, {} dirs'.format(root, len(files), len(dirs))) + for f in files: + samplefiles.append(root + '/' + f) + else: + samplefiles.append(p) + +tEnd = time.time() +tDuration = tEnd - tStart +print('gathered {} candidate files in {} seconds, from paths {}'.format(len(samplefiles), tDuration, str(paths))) + +#pick random file until we get an acceptable one +fi = random.randint(0, len(samplefiles)) +t = attemptReadSampleFile(samplefiles[fi]) +while t == None: + del samplefiles[fi] + fi = random.randint(0, len(samplefiles)) + t = attemptReadSampleFile(samplefiles[fi]) + +print() +print('{} : {}'.format(samplefiles[fi], t)) + + +#for f in 
samplefiles: +# print(f) + + + + |
