Made a more efficient version that doesn't read every file unnecessarily. It currently does not detect the filetype of rst files, though, so those are still treated as acceptable files.

author: thomas grothe <grothe.tr@gmail.com> 2023-11-30 12:04:59 -0600
committer: thomas grothe <grothe.tr@gmail.com> 2023-11-30 12:04:59 -0600
commit: c88bf4ed44f3366275034360101d64a4c060b410 (patch)
tree: b7c9f0025bca5282b1de737ea1107336d0b6a302
parent: 887489c283ba2bbf0538e7eb7c4531ca7c07a769 (diff)
2 files changed, 105 insertions, 12 deletions
diff --git a/wow.py b/wow.py
index de563e8..01b6e3a 100755
--- a/wow.py
+++ b/wow.py
@@ -8,6 +8,7 @@ import time
 import filetype #for filetype (extension and mime type)
 import chardet #for getting encoding
 import subprocess
+import random
 
 total_text = ''
 
@@ -16,7 +17,7 @@ def addSampleFileIfTxt(filepath):
     if v: print('checking {}'.format(filepath))
     if os.path.isfile(filepath):
         f = open(filepath, 'rb')
-        fb = f.read() #file data (bytes)
+        fb = f.read() #file data (bytes). unfortunately have to read the file to detect encoding
         enc = str(chardet.detect(fb)['encoding'])
         ftype = filetype.guess(filepath)
         if v: print('filetype: {}'.format(str(ftype)))
@@ -24,14 +25,15 @@ def addSampleFileIfTxt(filepath):
         if ftype != None:
             if ftype.extension in ['py', 'c', 'cc', 'h', 'hh', 'java']: #don't want code in the sample data
                 return
-            if ftype.extension == 'odt': 
-                if v: print('converting odt: {}'.format(filepath))
-                subproc = subprocess.run(['odt2txt', filepath], encoding='utf-8', stdout=subprocess.PIPE)
-                total_text += str(subproc.stdout)
+            if ftype.extension == 'odt':
+                samplefiles.append(filepath)
+                #if v: print('converting odt: {}'.format(filepath))
+                #subproc = subprocess.run(['odt2txt', filepath], encoding='utf-8', stdout=subprocess.PIPE)
+                #total_text += str(subproc.stdout)
         if enc in ['ascii','utf-8']:
             if v: print(fb)
-            total_text += str(fb, encoding='utf-8')            
-            
+            samplefiles.append(filepath)
+            #total_text += str(fb, encoding='utf-8')
         else:
             if v: print('{} is not txt')
     else: 
@@ -40,7 +42,7 @@ def addSampleFileIfTxt(filepath):
 
 #paths=['/home/thomas/doc/fiction']
 paths=['/home/thomas/doc/j/', '/home/thomas/_poetry', '/home/thomas/doc/_journal_2019'] #the paths to scan recursively for files from which to grab text
-samplefiles=[] #the individual files we want to grab text from
+samplefiles=[] #the paths of the individual files from which we want to grab text
 
 v=False #verbose 
 tStart = time.time()
@@ -54,12 +56,24 @@ for p in paths:
         else:
                addSampleFileIfTxt(p)
 
-#now we have all of our files of interest. so pick a random one
-
 tEnd = time.time()
 tDuration = tEnd - tStart
-print('report generated in {} seconds, from paths {}'.format(tDuration, str(paths)))
-print(total_text)
+print('gathered {} acceptable files in {} seconds, from paths {}'.format(len(samplefiles), tDuration, str(paths)))
+
+#now we have all of our files of interest. so pick a random one
+fi = random.randint(0, len(samplefiles)) # file index
+fc = samplefiles[fi] # the chosen one
+print('chose file ' + fc)
+f = open(fc, 'rb')
+fb = f.read()
+ftype = filetype.guess(fc)
+if ftype != None and ftype.extension == 'odt':
+     subproc = subprocess.run(['odt2txt', fc], encoding='utf-8', stdout=subprocess.PIPE)
+     print(subproc.stdout)
+else:
+     print(str(f.read(), encoding='utf-8'))
+
+
 #for f in samplefiles:
 #    print(f)
 
diff --git a/wow2.py b/wow2.py
new file mode 100755
index 0000000..7e9bc69
--- /dev/null
+++ b/wow2.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+#Words Of Wisdom
+# output some random text from my journals and writings
+
+import os
+import sys
+import time
+import filetype #for filetype (extension and mime type)
+import chardet #for getting encoding
+import subprocess
+import random
+
+total_text = ''
+
+#can we parse text out of this file or is it a libreoffice file? if so, return the contents
+def attemptReadSampleFile(filepath):
+    if v: print('checking {}'.format(filepath))
+    if os.path.isfile(filepath):
+        f = open(filepath, 'rb')
+        ftype = filetype.guess(filepath)
+        if v: print('filetype: {}'.format(str(ftype)))
+        if ftype != None:
+            if ftype.extension in ['py', 'c', 'cc', 'h', 'hh', 'java', 'rst', 'css', 'html', 'htm', 'js', 'php']: #don't want code in the sample data
+                if v: print('this file is code')
+                return None
+            if ftype.extension == 'odt':
+                subproc = subprocess.run(['odt2txt', filepath], encoding='utf-8', stdout=subprocess.PIPE)
+                return subproc.stdout
+        else:
+            fb = f.read() #file data (bytes) to detect encoding
+            enc = str(chardet.detect(fb)['encoding'])
+            if v: print('encoding: {}'.format(enc))
+            if enc in ['ascii','utf-8']:
+                return str(fb, encoding=enc)
+            else:
+                return None
+    else: 
+            if v: print('not file')
+            return None
+
+
+#paths=['/home/thomas/doc/fiction']
+paths=['/home/thomas/doc/j/', '/home/thomas/_poetry', '/home/thomas/doc/_journal_2019'] #the paths to scan recursively for files from which to grab text
+samplefiles=[] #the paths of the individual files from which we want to grab text
+
+v=True #verbose 
+tStart = time.time()
+for p in paths:
+        if v: print('path {}'.format(p))
+        if os.path.isdir(p):
+                for root,dirs,files in os.walk(p):
+                        if v: print('walk {}: {} files, {} dirs'.format(root, len(files), len(dirs)))
+                        for f in files:
+                            samplefiles.append(root + '/' + f)
+        else:
+               samplefiles.append(p)
+
+tEnd = time.time()
+tDuration = tEnd - tStart
+print('gathered {} candidate files in {} seconds, from paths {}'.format(len(samplefiles), tDuration, str(paths)))
+
+#pick random file until we get an acceptable one
+fi = random.randint(0, len(samplefiles))
+t = attemptReadSampleFile(samplefiles[fi])
+while t == None:
+    del samplefiles[fi]
+    fi = random.randint(0, len(samplefiles))
+    t = attemptReadSampleFile(samplefiles[fi])
+
+print()
+print('{} : {}'.format(samplefiles[fi], t))
+
+
+#for f in samplefiles:
+#    print(f)
+
+
+
+
author	thomas grothe <grothe.tr@gmail.com>	2023-11-30 12:04:59 -0600
committer	thomas grothe <grothe.tr@gmail.com>	2023-11-30 12:04:59 -0600
commit	c88bf4ed44f3366275034360101d64a4c060b410 (patch)
tree	b7c9f0025bca5282b1de737ea1107336d0b6a302
parent	887489c283ba2bbf0538e7eb7c4531ca7c07a769 (diff)