summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorgrothedev <grothedev@gmail.com>2024-07-01 20:01:31 -0500
committergrothedev <grothedev@gmail.com>2024-07-01 20:01:31 -0500
commitf70b6c9bfe22f01e89fe328f80dd0254a9116b31 (patch)
tree721863b8bd404c7419c75e1b5d77babaed7b196d
parent2feba75a80261b7b8dfb257218e6d0ff3546ca14 (diff)
add filetreetool ive been working on to help with organize my files and backups
-rwxr-xr-xfiletreetool.py332
-rwxr-xr-xogrep10
2 files changed, 336 insertions, 6 deletions
diff --git a/filetreetool.py b/filetreetool.py
new file mode 100755
index 0000000..54d571a
--- /dev/null
+++ b/filetreetool.py
@@ -0,0 +1,332 @@
+#!/usr/bin/python
+#a tool for traversing a directory structure and operating on its files
+# the directory structure is a tree, where each leaf is a file (or symlink pointing to the memory address of another directory node).
+# so it can be stored in memory as a tree datastructure.
+
+
+import os
+import sys
+import time
+import filetype #for filetype (extension and mime type)
+import chardet #for getting encoding
+import subprocess
+import random
+import hashlib
+import argparse
+from datetime import datetime
+
+#can we parse text out of this file or is it a libreoffice file? if so, return the contents
+def attemptReadSampleFile(filepath):
+    '''
+    try to extract plain text from a file
+    params:
+        filepath (str) : path of the file to sample
+    return: the text (str), or None when filepath is not a regular file, its guessed
+        type is in the source code list below, its type is recognised but not handled,
+        or its bytes are not detected as ascii/utf-8.
+        for an .odt document the output of `odt2txt` is returned.
+    NOTE(review): the indentation of this listing was lost; the branch that reads the raw
+        bytes is assumed to be the `else` of the `ftype` check, i.e. it handles files whose
+        type could not be guessed. confirm against the original file.
+    '''
+    if verbose: print('checking {}'.format(filepath))
+    if not os.path.isfile(filepath):
+        if verbose: print('not file')
+        return None
+
+    ftype = filetype.guess(filepath)
+    if verbose: print('filetype: {}'.format(str(ftype)))
+    if ftype is not None:
+        if ftype.extension in ['py', 'c', 'cc', 'h', 'hh', 'java', 'rst', 'css', 'html', 'htm', 'js', 'php', 'sh']: #don't want code in the sample data
+            if verbose: print('this file is code')
+            return None
+        if ftype.extension == 'odt' and not filepath.endswith('#'): #openoffice doc and not a lock file
+            return cmd('odt2txt ' + filepath)
+        return None #a recognised type that we do not know how to turn into text
+
+    #type unknown: accept the file as text only if its bytes look like ascii/utf-8
+    with open(filepath, 'rb') as f: #context manager so the handle is closed on every path
+        fb = f.read() #file data (bytes) to detect encoding
+    enc = str(chardet.detect(fb)['encoding'])
+    if verbose: print('encoding: {}'.format(enc))
+    if enc in ['ascii','utf-8']:
+        return str(fb, encoding=enc)
+    return None
+
+def getFilesRecursive(paths):
+    '''
+    expand a list of paths into a flat list of file paths
+    params:
+        paths (iterable<str>) : files and/or directories
+    return: list<str>. every file found by walking each directory (as root + '/' + name);
+        any path that is not a directory is passed through unchanged
+    '''
+    found = [] #the paths of the individual files from which we want to grab text
+    for entry in paths:
+        if verbose: print('path {}'.format(entry))
+        if not os.path.isdir(entry):
+            found.append(entry)
+            continue
+        for root, subdirs, names in os.walk(entry):
+            if verbose: print('walk {}: {} files, {} dirs'.format(root, len(names), len(subdirs)))
+            found.extend(root + '/' + name for name in names)
+    return found
+
+#returns some # of the most recently modified files in a directory, scanning recursively
+def getRecentlyModifiedFiles(dir, n=10):
+    '''
+    params:
+        dir (str) : directory to scan recursively
+        n (int or numeric str) : maximum number of entries to return. a string is accepted
+            because the command line dispatcher passes every argument as a string
+    return: list<tuple<path, mtime>> sorted newest first, at most n entries
+    raises: ValueError if n cannot be converted to an int
+    '''
+    n = int(n)
+    fileswmodtimes = []
+    for root,dirs,files in os.walk(dir):
+        for f in files:
+            path = f'{root}/{f}'
+            try:
+                fileswmodtimes.append( (path, os.path.getmtime(path)) )
+            except OSError: #dangling symlink, or the file vanished while scanning
+                continue
+    fileswmodtimes.sort(key=lambda x: x[1], reverse=True)
+    return fileswmodtimes[:n]
+
+#returns duplicate files from some given lists of files
+def findDuplicates(*args):
+    '''
+    look for files that show up with the same path, size and md5 in more than one of the
+    given directory trees (this happens when the trees overlap).
+    params:
+        *args (str) : directory paths. an argument of the form [user@]host:path is accepted
+            but skipped, remote trees are not supported yet
+    return: list of pairs [(i,ii),(j,jj)], meaning file ii of file list i is the same as
+        file jj of file list j. file lists are numbered in the order their local
+        directories were given
+    raises: TypeError if an argument is not a str
+            ValueError if an argument is neither a directory nor host:path
+    '''
+    for arg in args:
+        if not isinstance(arg, str):
+            raise TypeError('all arguments must be strings of file paths')
+
+    def md5OfFile(path):
+        #hashlib needs bytes, so read in binary mode; chunked so big files do not fill memory
+        digest = hashlib.md5()
+        with open(path, 'rb') as fb:
+            for chunk in iter(lambda: fb.read(65536), b''):
+                digest.update(chunk)
+        return digest.hexdigest()
+
+    filelists = []
+    for arg in args:
+        if os.path.isdir(arg):
+            fl = []
+            for root,dirs,files in os.walk(arg):
+                for f in files:
+                    fl.append((f'{root}/{f}',f,os.path.getsize(f'{root}/{f}'),'')) #tuple<path, name, size, hash>
+            filelists.append(fl)
+        elif ':' in arg: #remote file, [user@]host:path
+            continue #TODO add remote file support
+        else:
+            raise ValueError('argument must be a directory or remote file path')
+
+    #now we have a list of lists of files. look for duplicates
+    #path -> position lookup per list, so each file costs one dict lookup instead of a scan
+    positions = [{entry[0]: k for k, entry in enumerate(fl)} for fl in filelists]
+
+    dups = [] #duplicate files found, as a list of [(list index, file index), (list index, file index)]
+    #first, check for identical paths
+    for i, flist1 in enumerate(filelists):
+        for ii, f1 in enumerate(flist1):
+            for j in range(i+1, len(filelists)):
+                flist2 = filelists[j]
+                if flist2 == flist1: #the same tree was given twice, nothing to compare
+                    continue
+                jj = positions[j].get(f1[0])
+                if jj is None:
+                    continue
+                f2 = flist2[jj]
+                if f1[2] != f2[2]: #first of all check size
+                    continue
+                #the sizes were sampled during separate walks, so confirm with md5
+                if md5OfFile(f1[0]) == md5OfFile(f2[0]):
+                    dups.append([(i,ii),(j,jj)])
+    return dups
+
+def checkSimilarityOfDuplicates(dups, dir1, dir2):
+    '''
+    open every duplicate in both directories.
+    NOTE: unfinished. the files are only opened (so a missing file raises), nothing is
+    compared yet and nothing is returned.
+    params:
+        dups (iterable<str>) : file paths relative to both dir1 and dir2
+        dir1 (str), dir2 (str) : the two directories holding the duplicates
+    '''
+    for d in dups:
+        #context manager so both handles are closed again on every iteration
+        with open(dir1 + '/' + d, 'rb') as f1, open(dir2 + '/' + d, 'rb') as f2:
+            pass #TODO compare the contents of f1 and f2
+
+#meant to return the files that are present under both of the given directories (unfinished)
+def findDuplicateFiles(dir1, dir2):
+    '''
+    params:
+        dir1 (str), dir2 (str) : the directories to compare
+    return: -1 if either argument is not a directory. otherwise the file lists of both
+        trees are collected but not compared yet, and None is returned
+    '''
+    def listFiles(top):
+        #one tuple<path, name, size, hash> per file below top, hash left empty
+        entries = []
+        for root, subdirs, names in os.walk(top):
+            for name in names:
+                entries.append((f'{root}/{name}', name, os.path.getsize(f'{root}/{name}'), ''))
+        return entries
+
+    if not os.path.isdir(dir1) or not os.path.isdir(dir2):
+        return -1
+    fl1 = listFiles(dir1)
+    fl2 = listFiles(dir2)
+    #TODO compare fl1 with fl2 and return the files found in both
+    return None
+
+#returns the set of filenames that occur under both dir1 and dir2, each listed with `find <dir> -type f`.
+#the comparison is by filename only, the directories leading to a file are ignored,
+#so 2 files could have the same name yet contain different data.
+#the result is also printed.
+#keyword args:
+# fullpath: not implemented yet (TODO), currently ignored
+# pretty: print one filename per line instead of the raw set. default true because this tool is primarily used in cmd line
+def getDupFiles(dir1, dir2, fullpath=False, pretty=True):
+    listing1 = cmd(f'find {dir1} -type f').split('\n')
+    listing2 = cmd(f'find {dir2} -type f').split('\n')
+
+    #keep only what follows the last '/' of every path
+    names1 = [p.rpartition('/')[2] for p in listing1]
+    names2 = [p.rpartition('/')[2] for p in listing2]
+    dupFiles = (set(names1) & set(names2)) - {' ','','.','..'}
+
+    #TODO return fullpaths option
+    if pretty:
+        print(''.join(fn + '\n' for fn in dupFiles))
+    else:
+        print(dupFiles)
+    return dupFiles
+
+def getDuplicateFiles(dir1, dir2, fullpath=False, pretty=True):
+    '''alias of getDupFiles(): same arguments, same result'''
+    return getDupFiles(dir1, dir2, fullpath=fullpath, pretty=pretty)
+def getDupeFiles(dir1, dir2, fullpath=False, pretty=True):
+    '''alias of getDupFiles(): same arguments, same result'''
+    return getDupFiles(dir1, dir2, fullpath=fullpath, pretty=pretty)
+
+
+def getUniqueFiles(dir1, dir2, fullpath=False, pretty=True):
+    '''
+    find the files that exist in only one of two directories, each listed with `find <dir> -type f`.
+    the result is also printed.
+    params:
+        dir1 (str), dir2 (str) : the directories to compare
+        fullpath (bool) : compare the whole paths as printed by find, instead of only the filenames
+        pretty (bool) : print a readable listing instead of the raw sets
+    return: tuple<set, set>, the entries unique to dir1 and the entries unique to dir2
+    '''
+    #cmd() already returns str, decoding it a second time raises TypeError
+    files1 = cmd(f'find {dir1} -type f').split('\n')
+    files2 = cmd(f'find {dir2} -type f').split('\n')
+
+    if fullpath:
+        names1, names2 = files1, files2
+    else:
+        #make lists of filenames, without paths
+        names1 = [fpath.split('/')[-1] for fpath in files1]
+        names2 = [fpath.split('/')[-1] for fpath in files2]
+
+    junk = {' ','','.','..'} #left over from splitting the output of find
+    set1 = set(names1) - junk
+    set2 = set(names2) - junk
+    uniq1 = set1 - set2
+    uniq2 = set2 - set1
+
+    if pretty:
+        #sorted so that the listing is the same on every run
+        res = 'unique to 1:\n' + ''.join(fn + '\n' for fn in sorted(uniq1))
+        res += 'unique to 2:\n' + ''.join(fn + '\n' for fn in sorted(uniq2))
+        print(res)
+    else:
+        print(uniq1,uniq2)
+    return (uniq1,uniq2)
+def getUniqFiles(dir1, dir2, fullpath=False, pretty=True):
+    '''alias of getUniqueFiles(): same arguments, same result'''
+    return getUniqueFiles(dir1, dir2, fullpath=fullpath, pretty=pretty)
+
+def getDupeFilesHash(dir1, dir2, fullpath=False, pretty=True):
+    '''
+    for every filename that exists under both dir1 and dir2 (see getDupFiles), compute the
+    md5 of the file in each tree, so that same-named files with different contents can be told apart.
+    params:
+        dir1 (str), dir2 (str) : the directories to compare
+        fullpath : accepted for symmetry with getDupFiles, not used yet
+        pretty (bool) : print one "name hash1 hash2" line per file
+    return: list<tuple<filename, tuple<md5 in dir1, md5 in dir2>>>. when a name occurs more
+        than once in a tree, the first one met by os.walk is hashed. a hash is None when
+        the file could not be located or read
+    '''
+    def firstPathByName(top):
+        #filename -> path of the first file with that name below top
+        paths = {}
+        for root,dirs,files in os.walk(top):
+            for name in files:
+                paths.setdefault(name, f'{root}/{name}')
+        return paths
+
+    def md5OfFile(path):
+        if path is None:
+            return None
+        digest = hashlib.md5()
+        try:
+            with open(path, 'rb') as fb:
+                for chunk in iter(lambda: fb.read(65536), b''):
+                    digest.update(chunk)
+        except OSError: #unreadable file or dangling symlink
+            return None
+        return digest.hexdigest()
+
+    #walk the trees in python: no shell is involved, so odd filenames cannot break or inject a command
+    paths1 = firstPathByName(dir1)
+    paths2 = firstPathByName(dir2)
+
+    filesWithHash = []
+    for f in getDupFiles(dir1, dir2, fullpath=False, pretty=False):
+        hash1 = md5OfFile(paths1.get(f))
+        hash2 = md5OfFile(paths2.get(f))
+        if pretty:
+            print(f'{f} {hash1} {hash2}')
+        filesWithHash.append((f,(hash1,hash2)))
+    return filesWithHash
+
+
+
+
+#execute a command
+def cmd(cmdstr,v=False):
+    '''
+    execute a command using the python subprocess module.
+    no shell is involved: pipes, quotes and globs in cmdstr are not interpreted.
+    params:
+        cmdstr (str) : the command to run. it is split on spaces, so a single argument cannot contain a space
+        v : unused
+    return: stdout of command (str) when it exits with 0, otherwise its stderr (str)
+    '''
+    #drop the empty arguments that repeated spaces would otherwise produce
+    cmdarray = [tok for tok in cmdstr.strip().split(' ') if tok]
+    log(f'runcmd: {cmdstr}')
+    #TODO handling pipe not yet working
+    #stderr has to be piped too, otherwise proc.stderr is None and the failure path cannot return it
+    proc = subprocess.run(cmdarray, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    if proc.returncode == 0:
+        res = proc.stdout
+        if verbose:
+            log(f'result: {res}')
+        if proc.stderr: #warnings of a successful command no longer reach the terminal, so keep them in the log
+            log(f'stderr: {proc.stderr}')
+    else:
+        log(f'returncode = {proc.returncode}')
+        res = proc.stderr
+    #errors='replace' so that a filename that is not valid utf-8 does not abort the whole listing
+    return res.decode('utf-8', errors='replace')
+
+def log(msg):
+    '''
+    append "<timestamp>: <msg>" to filetreetool.log (relative to the current directory),
+    and echo the message to stdout when verbose is set
+    '''
+    entry = f'{tnow()}: {msg}\n'
+    with open('filetreetool.log', 'a') as logfile: #the with statement closes the file
+        logfile.write(entry)
+    if verbose: print(f'LOG: {msg}')
+
+def tnow():
+    '''
+    return: the current local time as a string such as 20240701-200131 (%Y%m%d-%H%M%S)
+    '''
+    return f'{datetime.now():%Y%m%d-%H%M%S}'
+
+verbose=False #when True, the functions of this module print progress/debug messages to stdout
+
+def callFunc(func_name, *args, **kwargs):
+    """
+    Look up a name in this module's global scope and call it.
+
+    :param func_name: (string) name of the function to call
+    :param args: positional arguments forwarded to the function
+    :param kwargs: keyword arguments forwarded to the function
+    :return: whatever the function returns. If the name is missing or not callable,
+             a message is printed and None is returned
+    """
+    #TODO keep a log of called functions and cached result, to use for optimization
+    target = globals().get(func_name)
+    if not callable(target):
+        print(f'function "{func_name}" not callable')
+        return None
+    return target(*args, **kwargs)
+
+
+if __name__ == '__main__':
+    #usage: filetreetool.py <function> [arg ...] [key=value ...]
+    #the named module function is called with the remaining arguments.
+    #every argument reaches the function as a string, except key=true / key=false, which become booleans
+    if len(sys.argv) == 1:
+        print('a tool for traversing a directory structure and operating on its files. \
+ the directory structure is a tree, where each leaf is a file (or symlink pointing to the memory address of another directory node). \
+ so it can be stored in memory as a tree datastructure. ')
+        sys.exit(0)
+
+    args = [] #separate out normal args and keyword args
+    kwargs = {} #a dict, so it can be passed on with ** as real keyword arguments
+    for a in sys.argv[2:]:
+        if '=' in a:
+            key, _, value = a.partition('=') #split on the first '=' only, the value may contain more of them
+            if not key:
+                print(f'invalid: {a}')
+                sys.exit(1)
+            if value.lower() in ('true', 'false'):
+                #without this, pretty=False would arrive as the truthy string 'False'
+                value = value.lower() == 'true'
+            kwargs[key] = value
+        else:
+            args.append(a)
+    callFunc(sys.argv[1], *args, **kwargs)
diff --git a/ogrep b/ogrep
index f3d2aa9..cba02c2 100755
--- a/ogrep
+++ b/ogrep
@@ -5,11 +5,9 @@ if [[ -z $1 || -z $2 ]]; then
echo "usage: ogrep [pattern] [file(s)]"
exit 0
fi
-for f in `ls ${2}`; do
- if [[ ! ${f} == *".odt" ]]; then
- continue
+find ${2} -type f -name "*odt" -print0 | while IFS= read -r -d '' f; do
+	res=$(odt2txt "${f}" | grep -e "${1}")
+	if [[ -n ${res} ]]; then
+		echo "${f}: ${res}"
fi
- echo "got " ${f}
- echo "${f}: "
- odt2txt ${f} | grep ${1}
done