#!/usr/bin/python """ find_duplicate_code -- Find similar code fragments """ ## ## Usage: ## find_duplicate_code [OPTION]... PATHNAME... ## ## Options: ## --abstract-scalars unify numbers and strings in input text (default: no) ## --abstract-symbols unify identifier names in input text (default: no) ## --amalgam build a big file from all known source files to ## compare each file with each other file ## -t N, --treshold=N set the min. number of duplicate lines (default: 4) ## -o F, --output=F write log to file F instead of stdout ## -n, --dry-run do not actually compare, only find and read files ## --help, --version print help or version information ## ## Example: Find duplicate code in known source files ## ## $ find_duplicate_code /path/to/sources ## ## Example: Find duplicates in all files of the working directory ## ## $ find_duplicate_code * ## ## Example: Find duplicate code lines in C++-files ## ## $ find /path/to/c++files -type f -regex '^.+\(cpp\|h\)$' | \ ## xargs find_duplicate_code --treshold=5 ## ## Example: Output ## ## Here find_duplicate_code has examined file libutils.sh. Three consecutive ## lines of code ("chunks") have been found that are duplicate at least once. ## Chunks 1 and 3 are duplicated once, chunk 3 is duplicated twice. As you can ## see the output is structured and can be processed further. ## ## libutils.sh: 1180 line(s) ## chunk 1, 215-218 (4) ## 215: local tpath="${path%/}" ## 216: while [ "${tpath}" != "${path}" ]; do ## 217: tpath="${path}"; path="${tpath%/}" ## 218: done ## duplicated at lines 231-234 (4) ## chunk 2, 224-226 (3) ## 224: { # ## 225: # @param PATH ## 226: # ## duplicated at lines 248-250 (3) ## duplicated at lines 259-261 (3) ## chunk 3, 248-250 (3) ## 248: { # ## 249: # @param PATH ## 250: # ## duplicated at lines 259-261 (3) ## ## Notes: ## ## The comparison algorithm is line-based and is described in section ## "ALGORITHM" below. ## ## This version of the script does not yet implement "--amalgam" and ## "--abstract-symbols". ## ######################### ## $Author: Andreas Spindler $ ## $ Writestamp: 2010-06-08 17:35:31$ ## $Maintained at: http://www.visualco.de$ import sys, os, locale, re, string, time VERSION = '0.3' MIN_PYTHON_VERSION = 2.4 # least Python version required opt_abstract_scalars = False opt_abstract_symbols = False opt_treshold = 4 opt_similarity = 0.95 # how similar two blocks of code must be declared opt_verbose = 0 opt_dry_run = 0 opt_amalgam = 0 opt_known_globs = ('*.[cChH]', '*.cs', '*.c[px][px]', '*.java', '*.js', '*.p[lym]', '*.txt', '*.sh', '*.el', '*.inc', '*.vbs' ) ######################################################################## ## ## DOMAIN-LEVEL TYPES ## ######################################################################## class Document: """Represents a source file to be compared with others.""" def __init__(self, f): if f == '-' or f is None: self.pathname = f self.dirname = None else: assert isinstance(f, str) f = os.path.abspath(f) if os.path.isdir(f): raise Exception(f + " is a directory") if not os.path.exists(f): raise Exception(f + " not found") self.pathname = os.path.realpath(f) # absolute, real (existing) pathname self.dirname = os.path.dirname(self.pathname) # absolute, real directory name self.pieces = None # lines -> pieces strings split self.lines = [] # pieces joind to abstract lines self.tokens = [] # lines -> token stream def Slurp(self): """Read all input lines.""" if self.pathname == '-': self.slurped = [ line.strip() for line in sys.stdin.readlines() ] else: if not os.path.exists(self.pathname): raise Exception(self.pathname + " cannot be opened for reading") self.slurped = [ line.strip() for line in open(self.pathname, 'r').readlines() ] def Parse(self): """Split input lines to words. Split at spaces, preserving quoted strings. When ABSTRACT_SCALARS is true insert abstractions for scalars too: NUMBER() and STRING()""" # Note that neither module `shlex' nor `csv' keep quoted characters, # hence the below regular expression. It does not handle \" properly # (e.g. test"\"file" or \"test"). With backreferences it might by # faster. self.pieces = [] self.lines = [] for l in self.slurped: p = [] if len(l): fragments = [ s for s in re.split("( |\\\".*?\\\"|'.*?')", l) if l.strip() ] words = [ ] for s in fragments: # Fragments are lines splitted at quoted strings. # TODO: Split each fragment `s' further at whitespaces to # get a word list. words.append(s) for w in words: if not len(w): continue if len(w) == 1 and w[0] == ' ': continue if opt_abstract_scalars: if w[0] == '"' or w[0] == "'": w = 'STRING' if opt_abstract_symbols: if self.language: pass # TODO: from the word list change each name # that is not a keyword to "identifier". It is # therefore necessay to know the language of # the input text. p.append(w) self.pieces.append(p) self.lines.append(string.join(p,'')) class DocumentSet: """Represents all unique input files.""" def __init__(self): self.inputfiles = [] self.documents = [] # path -> `Document' self.outfile = '-' # where to write the result self.home = os.environ.get('HOME') if self.home is not None: self.config_filename = os.path.join(self.home, '.config-file') def SetInput(self, f): """Find all concrete filenames in F. F can be a list, tuple, directory-name or filename.""" if isinstance(f, list) or isinstance(f, tuple): # flatten lists/tuples for i in f: self.SetInput(i) else: if isinstance(f, str): if os.path.isdir(f): # Recursively read known sources files (as defined by # `opt_known_globs'). if opt_verbose: print os.path.abspath(f) + ": found directory" sys.stdout.flush import glob # http://docs.python.org/library/glob.html for x in [ y for y in os.listdir(f) ]: x = os.path.join(f, x) if os.path.isdir(x): self.SetInput(x) else: self.SetInput([ glob.glob(os.path.join(f, x)) for x in opt_known_globs ]) else: # `f' is not a directory. if opt_verbose: print os.path.abspath(f) + ": found source file" sys.stdout.flush self.inputfiles.append(f) else: # assume file object (stdin) self.inputfiles.append('-') def SetOutput(self, f): if isinstance(f, str): self.outfile = os.path.abspath(f) else: # assume file object or None self.outfile = '-' def CompileDocuments(self): """Create `Document' per input files.""" self.documents = [] for f in self.inputfiles: self.documents.append(Document(f)) ######################################################################## ## ## UTILITY FUNCTIONS ## ######################################################################## def symbolize(s): """Drop non-symbol characters and convert to lowercase.""" return re.sub(r'(?u)[^\w\-_]', '', s).lower() def is_array(obj): """Return True if OBJ is list or tuple type.""" return isinstance(obj, list) or isinstance(obj, tuple) def strip_list(s): """Return list with empty items from start and end of list removed.""" for i in range(len(s)): # strip from head if s[i]: break else: return [] s = s[i:] for i in range(len(s) -1, -1, -1): # strip from tail if s[i]: break else: return [] return s[:i+1] def uniqify(l): """Remove duplicates from a list.""" return list(set(l)) def join_string_list(lines1, lines2): """ Append list or tuple of strings LINES2 to list LINES1. Join the last non-blank item in 'lines1' with the first non-blank item in LINES2 into a single string. """ assert is_array(lines1) assert is_array(lines2) lines1 = strip_list(lines1) lines2 = strip_list(lines2) if not lines1 or not lines2: return list(lines1) + list(lines2) result = list(lines1[:-1]) result.append(lines1[-1] + lines2[0]) result += list(lines2[1:]) return result def time_string(t): """Convert seconds since the Epoch to formatted local time string.""" t = time.localtime(t) s = time.strftime('%H:%M:%S',t) if time.daylight: result = s + ' ' + time.tzname[1] else: result = s + ' ' + time.tzname[0] # Attempt to convert the localtime to the output encoding. try: result = char_encode(result.decode(locale.getdefaultlocale()[1])) except Exception: pass return result def date_string(t): """Convert seconds since the Epoch to formatted local date string.""" t = time.localtime(t) return time.strftime('%Y-%m-%d',t) def file_exists(fname, dirname): """True if file FNAME resides inside dirname.""" assert os.path.isfile(fname) # Empty dirname (not to be confused with None) is the current dirname. if dirname == '': dirname = os.getcwd() else: assert os.path.isdir(dirname) dirname = os.path.realpath(dirname) fname = os.path.realpath(fname) return os.path.commonprefix((dirname, fname)) == dirname ######################################################################## ## ## ALGORITHM ## ######################################################################## def find_duplicate_code_ranges(D): """Find duplicate lines in a `Document' D. Returns list of duplicates or `None'.""" # # Algorithm: # # for i = first to last line # a = line indexed by i # if a not empty # k = i # for j = i + 1 to last line # b = line indexed by j # if b not empty # if a == b # push original text indexed by k to c # k = k + 1 # a = line indexed by k # else # if size of c >= threshold # print information # k = i (reset to first line to be matched) # clear c # # Notes: # # Possibly this algorithm has been found earlier; I don't know. It reads # input files line by line. For an exact analysis, however, I think it is # more efficient to compare token streams. The related algorithm then: # # - In functions, find statements (if, while, for etc.) # # - Ignore local variables and local typedefs # # - Let each statement form the root of an abstract syntax tree # # - When comparing functions compare the set of AST's per function # # This algorithm is more efficient, since it results in less "false # positives" than when comparing line-by-line. Miminizing false positives, # becomes relevant when comparing millions of code lines. # assert opt_treshold > 1 R = [] i = j = 0 end = len(D.lines) chunk_num = 0 while i < end: # from top to down a = D.lines[i] chunk_size = 0 #print "% 5u: *** '%s'" % (i + 1, D.lines[i]) if len(a): k = i j = i + 1 c = [] count = 0 while j < end: b = D.lines[j] # if chunk_size: # print "j=%d k=%d b='%s' a = '%s'" %(j, k, b, a) if not len(b): # ingore empty lines count += 1 j += 1 elif b == a: c.append(D.slurped[k]) count += 1 k += 1 # advance to next line to match a = D.lines[k] j += 1 else: # some non-empty line did not match if len(c) >= opt_treshold: if not chunk_size: chunk_size = len(c) chunk_num += 1 print " chunk %u, %u-%u (%u)" % (chunk_num, i + 1, k, chunk_size) for l in range(i, k): print "% 9u: %s" % (l + 1, c[l - i]) start = j - count + 1 print " duplicated at lines %u-%u (%u)" % (start, j, count) else: j += 1 ## Back to first line that must match. k = i a = D.lines[k] c = [] count = 0 ## Jump over the minimum number of matched lines or goto next line. ## print " compared line %u" % (i + 1) i += max(1, chunk_size) if chunk_size: assert chunk_size >= opt_treshold # print " continuing at line %u" else: i += 1 if len(R): return R return None ######################################################################## ## ## MAIN CODE ## ######################################################################## class Usage(Exception): def __init__(self, msg): self.msg = msg Ds = DocumentSet() def main(cmd, opts, rawfilenames): """ Execute CMD with command-line options and arguments. OPTS and FILES conform to getopt-values. """ # Locate the executable and configuration files directory. global Ds if not os.path.exists(cmd): raise Exception("Non-existing command '%s'" % cmd) # Application-related options and input files. Throws unless a file exists. for k, v in opts: if k in ('-o', '--output'): Ds.SetOutput(v) if not isinstance(v, str): sys.stdout = v # bind stdout to file object rawfilenames = uniqify(rawfilenames) Ds.SetInput(rawfilenames) for v in Ds.inputfiles: if not isinstance(v, str): sys.stdin = v # bind stdin to file object Ds.CompileDocuments() # Do the work: # (1) slurp input files, create `Document' objects # (2) split lines at words into strings # (3) remove comments, optimize lines # (4) find duplicates if opt_verbose: t = time.time() ## print time_string(t), date_string(t) ## print 'input paths = %s' % (string.join(Ds.inputfiles)) ## print 'output log = %s' % (Ds.outfile) sys.stdout.flush # Step 1-2 for D in Ds.documents: print "%s: reading" % (D.pathname) sys.stdout.flush D.Slurp() D.Parse() # Step 3 for D in Ds.documents: pass # TODO: when a line contains no [A-Za-z0-9"'] assume it contains # operators/gibberish/braces. Then concat it with previous line # and keep this line empty (so that the indeces of `D.slurped' and # `D.lines' are still identical). # Step 4 for D in Ds.documents: print '%s: %u line(s)' % (D.pathname, len(D.lines)) sys.stdout.flush if not opt_dry_run: find_duplicate_code_ranges(D) sys.stdout.flush return 0 if __name__ == '__main__': # Parse command-line options, then process trivial options, then `main()'. import getopt R = 0 if float(sys.version[:3]) < MIN_PYTHON_VERSION: message.stderr('FAILED: Python 2.4+ required') sys.exit(1) else: stdin, stdout = sys.stdin, sys.stdout try: try: cmd = sys.argv[0] opts, rest = getopt.getopt(sys.argv[1:], 'ht:nvo:', ['help', 'dry', 'verbose', 'version', 'abstract-scalars', 'abstract-symbols', 'amalgam', 'treshold=', 'output=']) except getopt.GetoptError, msg: raise Usage(msg) for k in [opt[0] for opt in opts]: if k in ('--help', '-h'): raise Usage("No man-page available") elif k in ('--version', '-v'): ## print(cmd + ' ' + VERSION) print(VERSION) exit(0) else: opt_dry_run = k in ('--dry', '-n') opt_verbose = k in ('--verbose') opt_amalgam = k in ('--amalgam') opt_abstract_scalars = k in ('--abstract-scalars') opt_abstract_symbols = k in ('--abstract-symbols') for k, v in opts: if k in ('--treshold', '-t'): opt_treshold = int(v) if len(rest) == 0: raise Usage('Missing file argument(s)') if opt_abstract_symbols: raise Usage('Sorry, "--abstract-symbols" not yet implemented in this version') if opt_amalgam: raise Usage('Sorry, "--amalgam" not yet implemented in this version') stdout.flush try: R = main(cmd, opts, rest) except KeyboardInterrupt: R = 1 except Usage, err: print >>sys.stderr, """ %(msg)s Usage: %(cmd)s [OPTION]... PATHNAME... Options: --abstract-scalars unify numbers and strings in input text (default: no) -t N, --treshold=N set the min. # for duplicate lines (default: 4) -n, --dry-run do not actually compare, only find and read files -v, --verbose --help, --version print help or version information Description: Duplicate code lines are only found per file unless the `--amalgam' option is specified. Each PATHNAME can be a directory- or filename. In case of a directory-name finds all known source files in the directory. Known source files: %(globs)s Examples: Find duplicate code tracks in all files of the current directory $ find_duplicate_code * Find duplicate code in all known source files $ find_duplicate_code /path/to/sources Analyze C++-files $ find /path/to/c++files -type f -regex '^.+\(cpp\|h\)$' | \\ xargs find_duplicate_code --treshold=5 """ % {"cmd": cmd, "msg": err.msg, "globs": opt_known_globs} R = 2 finally: sys.stdin, sys.stdout = stdin, stdout if R == 0: print "OK(%d)" % (R) elif R < 0: print "FAILED(%d)" % (R) sys.exit(R) ### find_duplicate_code ends here