2 files changed, 336 insertions, 6 deletions
diff --git a/filetreetool.py b/filetreetool.py
new file mode 100755
index 0000000..54d571a
--- /dev/null
+++ b/filetreetool.py
@@ -0,0 +1,332 @@
+#!/usr/bin/python
+#a tool for traversing a directory structure and operating on its files
+# the directory structure is a tree, where each leaf is a file (or symlink pointing to the memory address of another directory node).
+# so it can be stored in memory as a tree datastructure.  
+
+
+import os
+import sys
+import time
+import filetype #for filetype (extension and mime type)
+import chardet #for getting encoding
+import subprocess
+import random
+import hashlib
+import argparse
+from datetime import datetime
+
+#can we parse text out of this file or is it a libreoffice file? if so, return the contents
+def attemptReadSampleFile(filepath):    
+    if verbose: print('checking {}'.format(filepath))
+    if os.path.isfile(filepath):
+        f = open(filepath, 'rb')
+        ftype = filetype.guess(filepath)
+        if verbose: print('filetype: {}'.format(str(ftype)))
+        if ftype != None:
+            if ftype.extension in ['py', 'c', 'cc', 'h', 'hh', 'java', 'rst', 'css', 'html', 'htm', 'js', 'php', 'sh']: #don't want code in the sample data
+                if verbose: print('this file is code')
+                return None
+            if ftype.extension == 'odt' and filepath[len(filepath)-1] != '#': #openoffice doc and not a lock file
+                return cmd('odt2txt ' + filepath)
+        else:
+            fb = f.read() #file data (bytes) to detect encoding
+            enc = str(chardet.detect(fb)['encoding'])
+            if verbose: print('encoding: {}'.format(enc))
+            if enc in ['ascii','utf-8']:
+                return str(fb, encoding=enc)
+            else:
+                return None
+    else: 
+            if verbose: print('not file')
+            return None
+
+def getFilesRecursive(paths):
+    samplefiles=[] #the paths of the individual files from which we want to grab text
+    for p in paths:
+            if verbose: print('path {}'.format(p))
+            if os.path.isdir(p):
+                    for root,dirs,files in os.walk(p):
+                            if verbose: print('walk {}: {} files, {} dirs'.format(root, len(files), len(dirs)))
+                            for f in files:
+                                samplefiles.append(root + '/' + f)
+            else:
+                   samplefiles.append(p)
+    return samplefiles
+
+#returns some # of the most recently modified files in a directory, scanning recursively
+def getRecentlyModifiedFiles(dir, n=10):
+    fileswmodtimes = []
+    for root,dirs,files in os.walk(dir):
+        for f in files:
+            fileswmodtimes.append( (f'{root}/{f}', os.path.getmtime(f'{root}/{f}')) )
+    return sorted(fileswmodtimes, key=lambda x: x[1], reverse=True)[:n]
+    #if include_time:
+    #    return sorted(fileswmodtimes, key=lambda x: x[1], reverse=True)[:n]
+    #else:
+    #    return [f[0] for f in sorted(fileswmodtimes, key=lambda x: os.path.getmtime(x[0]), reverse=True)[:n]]
+
+#returns duplicate files from some given lists of files
+def findDuplicates(*args):
+    for arg in args:
+         if not isinstance(arg, str):
+             raise TypeError('all arguments must be strings of file paths')
+    filelists = []
+    dups = [] #duplicate files found, as a list<tuple<int,int>> grouping together each file as mappings of filelist index to file index within that list. 
+    for arg in args:
+        if os.path.isdir(arg):
+            fl = []
+            for root,dirs,files in os.walk(arg):
+                for f in files:
+                    fl.append((f'{root}/{f}',f,os.path.getsize(f'{root}/{f}'),'')) #tuple<path, name, size, hash>
+            filelists.append(fl)
+        elif ':' in arg: #remote file
+            tmp = arg.split(':')
+            host = tmp[0]
+            user = None
+            if '@' in host:
+                userhost = host.split('@')
+                user = userhost[0]
+                host = userhost[1]
+            path = tmp[1]
+            #TODO add remote file support
+        else:
+            raise ValueError('argument must be a directory or remote file path')
+            
+    #now we have a list of lists of files. look for duplicates via a few different approaches
+    
+    #union = {}
+    #intersection = set(filelists[0])
+    #for fl in filelists:
+    #    union = union | set(fl)
+    #    intersection = intersection & set(fl)
+
+
+    #first, check for identical paths
+    for i in range(len(filelists)):
+        flist1 = filelists[i]
+        for ii in range(len(flist1)):
+            f1 = flist1[ii]
+            for j in range(i+1, len(filelists)):
+                flist2 = filelists[j]
+                if flist2 != flist1:
+                    for jj in range(len(flist2)):
+                        f2 = flist2[jj]
+                        #recall that each element of the filelist is a tuple<filepath,filename,size,hash(not-yet-calculated)>
+                        if f1[0] == f2[0]: #exact same filepath, probably same file, check md5
+                            if f1[2] == f2[2]: #first of all check size
+                                dups.append([(i,ii),(j,jj)])
+                            fb = open(f1[0],'r')
+                            hash1 = hashlib.md5(fb).hexdigest()
+                            fb.close()
+                            fb = open(f2[0],'r')
+                            hash2 = hashlib.md5(fb).hexdigest()
+                            fb.close()    
+    return None #todo     
+
+def checkSimilarityOfDuplicates(dups, dir1, dir2):
+    for d in dups:
+        f1 = open(dir1 + '/' + d, 'rb')
+        f2 = open(dir2 + '/' + d, 'rb')
+
+#returns a list of files that are in both lists of files from each given directory
+def findDuplicateFiles(dir1, dir2):
+    if not (os.path.isdir(dir1) and os.path.isdir(dir2)):
+         return -1
+    fl1 = [] #list of file paths
+    fl2 = []
+    for root,dirs,files in os.walk(dir1):
+        for f in files:
+            fl1.append((f'{root}/{f}',f,os.path.getsize(f'{root}/{f}'),'')) #tuple<path, name, size, hash>
+    for root,dirs,files in os.walk(dir2):
+        for f in files:
+            fl2.append((f'{root}/{f}',f,os.path.getsize(f'{root}/{f}'),'')) #tuple<path, name, size, hash>    
+    
+#    for i in range(len(fl1)):
+ #       for j in range(i+1, len(fl2)):
+  #          #TODO            
+
+   # return list(set(l1) & set(l2))
+
+#looks at all the files of the 2 given dirs, returns the following sets:
+#   unique to 1: files that only exist in dir1
+#   unique to 2: files that only exist in dir2
+#   duplicates: files that exist in both dirs
+#this is done by filename only, and does not take into account the full path of the files, unless the arg fullpath = True. 
+#so 2 files could have the same name yet contain different data. use the function compareFileHashes() to figure out if files with same name have different contents. 
+#keyword args:
+#   fullpath: return the full path as opposed to just the filename
+#   pretty: return formatted text. default true because this tool is primarily used in cmd line
+def getDupFiles(dir1, dir2, fullpath=False, pretty=True):
+    #files1 = str(cmd(f'find {dir1} -type f'), encoding='utf-8').split('\n')
+    #files2 = str(cmd(f'find {dir2} -type f'), encoding='utf-8').split('\n')
+    files1 = cmd(f'find {dir1} -type f').split('\n')
+    files2 = cmd(f'find {dir2} -type f').split('\n')
+
+    #make lists of filenames, without paths
+    filenames1 = []
+    filenames2 = []
+    for fpath in files1:
+        tmp=fpath.split('/')
+        filenames1.append(tmp[len(tmp)-1])
+    for fpath in files2:
+        tmp=fpath.split('/')
+        filenames2.append(tmp[len(tmp)-1])
+    dupFiles = (set(filenames1) & set(filenames2)) - {' ','','.','..'}
+
+    #TODO return fullpaths option
+    if pretty:
+        res = ''
+        for fn in dupFiles:
+            res += fn+'\n'
+        print(res)
+    else:
+        print(dupFiles)
+    return dupFiles
+
+def getDuplicateFiles(dir1, dir2, fullpath=False, pretty=True):
+    return getDupFiles(dir1, dir2, fullpath, pretty)
+def getDupeFiles(dir1, dir2, fullpath=False, pretty=True):
+    return getDupFiles(dir1, dir2, fullpath, pretty)
+
+
+def getUniqueFiles(dir1, dir2, fullpath=False, pretty=True):
+    files1 = str(cmd(f'find {dir1} -type f'), encoding='utf-8').split('\n')
+    files2 = str(cmd(f'find {dir2} -type f'), encoding='utf-8').split('\n')
+
+    #make lists of filenames, without paths, if applicable
+    if not fullpath:
+        filenames1 = []
+        filenames2 = []
+        for fpath in files1:
+            tmp=fpath.split('/')
+            filenames1.append(tmp[len(tmp)-1])
+        for fpath in files2:
+            tmp=fpath.split('/')
+            filenames2.append(tmp[len(tmp)-1])
+    union = (set(filenames1) | set(filenames2)) - {' ','','.','..'}
+    uniq1 = union - set(filenames2)
+    uniq2 = union - set(filenames1)
+
+    if pretty:
+        res = 'unique to 1:\n'
+        for fn in uniq1:
+            res += fn+'\n'
+        res += 'unique to 2:\n'
+        for fn in uniq2:
+            res += fn+'\n'
+        print(res)
+    else:
+        print(uniq1,uniq2)
+    return (uniq1,uniq2)
+def getUniqFiles(dir1, dir2, fullpath=False, pretty=True):
+    return getUniqueFiles(dir1, dir2, fullpath, pretty)
+
+def getDupeFilesHash(dir1, dir2, fullpath=False, pretty=True):
+    filesWithHash = []
+    for f in getDupFiles(dir1, dir2, fullpath=False, pretty=False):
+        f1 = os.system(f'find {dir1} | grep {f}') #TODO continue work here
+        f2 = os.system(f'find {dir2} | grep {f}')
+        print(f1)
+        #hash1 = cmd(f'md5sum {f1}')
+        #hash2 = cmd(f'md5sum {f2}')
+        #print(hash1+' '+hash2)
+        filesWithHash.append((f,hash))
+    #print(filesWithHash)
+    return filesWithHash    
+    
+
+
+
+#execute a command
+'''def cmd(cmdStr):
+    return subprocess.run(cmdStr.join(' '), encoding='utf-8', stdout=subprocess.PIPE).stdout
+'''
+'''
+execute a command using the python subprocess module
+params:
+    cmdstr (str) : the command to run
+return: stdout of command
+'''
+def cmd(cmdstr,v=False):
+    cmdarray = cmdstr.strip().split(' ')
+    log(f'runcmd: {cmdstr}')
+    '''if '|' in cmdarray:   #TODO handling pipe not yet working
+        i = cmdarray.index('|')  
+        proc0 = subprocess.check_output(cmdstr, shell=True)'''
+    proc = subprocess.run(cmdarray, stdout=subprocess.PIPE)
+    if proc.returncode == 0:
+        res = proc.stdout
+        if verbose:
+            log(f'result: {res}')
+    else:
+        log(f'returncode = {proc.returncode}')
+        res = proc.stderr
+    return str(res, encoding='utf-8')
+
+def log(msg):
+    t = tnow()
+    with open('filetreetool.log', 'a') as lf:
+        lf.write(f'{t}: {msg}\n')
+        lf.close()
+    if verbose: print(f'LOG: {msg}')
+
+def tnow():
+    '''
+    return: current time, formatted as %Y%m%d-%H%M%S
+    ''' 
+    return datetime.now().strftime('%Y%m%d-%H%M%S')
+
+verbose=False #verbose 
+
+def callFunc(func_name, *args, **kwargs):
+    """
+    Calls a function by its name if it exists in the global scope.
+    
+    :param func_name: (string) name of the function to call
+    :param args: (List<string>) Positional arguments to pass to the function
+    :param kwargs: (List<tuple<string>>) Keyword arguments to pass to the function
+    :return: The result of the function call if successful, None otherwise
+    """
+    #TODO keep a log of called functions and cached result, to use for optimization
+    if func_name in globals() and callable(globals()[func_name]):
+        func = globals()[func_name]
+        return func(*args, **kwargs)
+    else:
+        print(f'function "{func_name}" not callable')
+        return None        
+    
+
+if __name__ == '__main__':
+    '''
+    older less-dynamic implementation
+    parser = argparse.ArgumentParser(description='a tool to analyze and manage files in a directory structure')
+    parser.add_argument('func', help='module function to call')
+    parser.add_argument('args', nargs='*', help='function args')
+    args = parser.parse_args()
+
+    if args.func == 'compareDupesOfFiles':
+        if len(args.args < 2):
+            print('need (dir1, dir2)')
+            sys.exit(1)
+        result = compareDupesOfFiles(args.args[0], args.args[1])'''
+    if len(sys.argv) == 1:
+        print('a tool for traversing a directory structure and operating on its files. \
+            the directory structure is a tree, where each leaf is a file (or symlink pointing to the memory address of another directory node). \
+            so it can be stored in memory as a tree datastructure.  ')
+        sys.exit(0)
+    if len(sys.argv) == 2:
+        callFunc(sys.argv[1])
+    elif len(sys.argv) > 2:
+        args = [] #separate out normal args and keyword args
+        kwargs = []
+        for a in sys.argv[2:]:
+            if '=' in a:
+                tmp = a.split('=')
+                if len(tmp) != 2:
+                    print(f'invalid: {a}')
+                    sys.exit(1)
+                kwargs.append([tmp[0],tmp[1]])
+            else:
+                args.append(a)
+        #print(f'calling {sys.argv[1]}({",".join(args)}  {kwargs})')
+        callFunc(sys.argv[1], *args, *kwargs)
diff --git a/ogrep b/ogrep
index f3d2aa9..cba02c2 100755
--- a/ogrep
+++ b/ogrep
@@ -5,11 +5,9 @@ if [[ -z $1 || -z $2 ]]; then
 	echo "usage: ogrep [pattern] [file(s)]"
 	exit 0
 fi
-for f in `ls ${2}`; do
-	if [[ ! ${f} == *".odt" ]]; then
-		continue
+for f in `find ${2} -type f -name "*odt"`; do
+	res=`odt2txt ${f} | grep ${1}`
+	if [[ $res ]]; then
+		echo "${f}: ${res}"
 	fi
-	echo "got " ${f}
-	echo "${f}: "
-	odt2txt ${f} | grep ${1}
 done