"""
find_duplicate_code -- Find similar code fragments
"""
import sys, os, locale, re, string, time
# Program version, printed for the `--version' command-line option.
VERSION = '0.3'
# Oldest supported interpreter, compared against the running version at start-up.
MIN_PYTHON_VERSION = 2.4

# Run-time settings; the script entry point overrides them from the command
# line.  (The spelling `treshold' is part of the option and variable names.)
opt_abstract_scalars = False
opt_abstract_symbols = False
# Minimum number of matching lines before a duplicate is reported.
opt_treshold = 4
# NOTE(review): not read anywhere in the visible source.
opt_similarity = 0.95
opt_verbose = 0
opt_dry_run = 0
opt_amalgam = 0

# Glob patterns of the files picked up when a directory is given as input.
opt_known_globs = (
    '*.[cChH]', '*.cs', '*.c[px][px]',
    '*.java', '*.js',
    '*.p[lym]',
    '*.txt',
    '*.sh', '*.el', '*.inc',
    '*.vbs'
)
class Document:
    """Represents a source file to be compared with others.

    Attributes:
      pathname  resolved path of the file; '-' or None means standard input
      dirname   directory holding the file (None for standard input)
      slurped   stripped input lines, filled by Slurp()
      pieces    list of word lists, one per input line, filled by Parse()
                (None until Parse() has run)
      lines     one string per input line: its words joined without
                separators, filled by Parse()
      tokens    initialised to an empty list; not filled by this class
    """

    # Splits at single spaces while keeping quoted strings in one piece.  The
    # capturing group makes re.split() return the separators as well; Parse()
    # drops the empty and single-space fragments afterwards.
    _WORD_SPLIT = re.compile("( |\\\".*?\\\"|'.*?')")

    def __init__(self, f):
        """Bind the document to pathname F ('-' or None: standard input).

        Raises Exception when F names a directory or does not exist.
        """
        if f == '-' or f is None:
            self.pathname = f
            self.dirname = None
        else:
            assert isinstance(f, str)
            f = os.path.abspath(f)
            if os.path.isdir(f):
                raise Exception(f + " is a directory")
            if not os.path.exists(f):
                raise Exception(f + " not found")
            self.pathname = os.path.realpath(f)
            self.dirname = os.path.dirname(self.pathname)
        # Initialised for every kind of input so that Parse() can safely be
        # called before Slurp().
        self.slurped = []
        self.pieces = None
        self.lines = []
        self.tokens = []

    def Slurp(self):
        """Read all input lines into `self.slurped', stripped of whitespace.

        Raises Exception when the file has disappeared since construction.
        """
        # __init__ accepts None as a synonym for '-', so both read stdin.
        if self.pathname is None or self.pathname == '-':
            raw = sys.stdin.readlines()
        else:
            if not os.path.exists(self.pathname):
                raise Exception(self.pathname + " cannot be opened for reading")
            handle = open(self.pathname, 'r')
            try:
                raw = handle.readlines()
            finally:
                # Close explicitly instead of relying on garbage collection.
                handle.close()
        self.slurped = [line.strip() for line in raw]

    def Parse(self):
        """Split the slurped lines into words and fill `pieces' and `lines'.

        Lines are split at spaces, preserving quoted strings.  When the
        global `opt_abstract_scalars' is true every quoted string is replaced
        by the word STRING.  Numbers are not abstracted, and symbol
        abstraction (`opt_abstract_symbols') is not implemented.
        """
        self.pieces = []
        self.lines = []
        for l in self.slurped:
            p = []
            for w in self._WORD_SPLIT.split(l):
                # Skip the empty fragments and single-space separators that
                # the capturing split leaves behind.
                if not w or w == ' ':
                    continue
                if opt_abstract_scalars and w[0] in ('"', "'"):
                    w = 'STRING'
                p.append(w)
            self.pieces.append(p)
            self.lines.append(''.join(p))
class DocumentSet:
    """Represents all unique input files.

    Attributes:
      inputfiles       pathnames collected by SetInput(); '-' is standard input
      documents        `Document' objects built by CompileDocuments()
      outfile          absolute output pathname, or '-' for standard output
      home             value of the HOME environment variable, or None
      config_filename  `.config-file' inside HOME, or None without HOME
    """

    def __init__(self):
        self.inputfiles = []
        self.documents = []
        self.outfile = '-'
        self.home = os.environ.get('HOME')
        # Always define the attribute, also when HOME is not set.
        self.config_filename = None
        if self.home is not None:
            self.config_filename = os.path.join(self.home, '.config-file')

    def _add_input(self, name):
        """Append NAME to the input files unless it is already listed."""
        if name not in self.inputfiles:
            self.inputfiles.append(name)

    def SetInput(self, f):
        """Find all concrete filenames in F. F can be a list, tuple, directory-name or filename.

        A directory is searched recursively for files matching
        `opt_known_globs'.  Anything that is not a string, list or tuple
        stands for standard input ('-').
        """
        if isinstance(f, (list, tuple)):
            for item in f:
                self.SetInput(item)
            return
        if not isinstance(f, str):
            self._add_input('-')
            return
        if not os.path.isdir(f):
            if opt_verbose:
                print(os.path.abspath(f) + ": found source file")
                sys.stdout.flush()
            self._add_input(f)
            return

        import glob
        if opt_verbose:
            print(os.path.abspath(f) + ": found directory")
            sys.stdout.flush()
        # Descend into the sub-directories first ...
        for entry in sorted(os.listdir(f)):
            path = os.path.join(f, entry)
            if os.path.isdir(path):
                self.SetInput(path)
        # ... then collect the known source files of this directory, globbing
        # once per directory rather than once per directory entry.
        for pattern in opt_known_globs:
            for match in sorted(glob.glob(os.path.join(f, pattern))):
                # Directories whose name matches a pattern were already
                # handled by the recursion above.
                if not os.path.isdir(match):
                    self.SetInput(match)

    def SetOutput(self, f):
        """Record output pathname F; a non-string selects standard output."""
        if isinstance(f, str):
            self.outfile = os.path.abspath(f)
        else:
            self.outfile = '-'

    def CompileDocuments(self):
        """Create `Document' per input files."""
        self.documents = [Document(f) for f in self.inputfiles]
def symbolize(s):
    """Return S in lowercase, keeping only word characters and hyphens."""
    kept = re.sub(r'(?u)[^\w\-_]', '', s)
    return kept.lower()
def is_array(obj):
    """Tell whether OBJ is a list or a tuple (subclasses included)."""
    return isinstance(obj, (list, tuple))
def strip_list(s):
    """Return S without the empty (false) items at its start and end.

    The result is a slice of S, except that an S without any non-empty
    item always yields an empty list.
    """
    filled = [pos for pos, item in enumerate(s) if item]
    if not filled:
        return []
    return s[filled[0]:filled[-1] + 1]
def uniqify(l):
    """Remove duplicates from a list.

    Returns a new list that keeps the first occurrence of every item in its
    original position, so callers see a stable, predictable order.  The items
    must be hashable.
    """
    seen = set()
    result = []
    for item in l:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result
def join_string_list(lines1, lines2):
    """
    Concatenate the string sequences LINES1 and LINES2 into one list.

    Empty items at the start and end of both arguments are dropped first.
    The last remaining item of LINES1 and the first remaining item of LINES2
    are glued together into a single string.  When either argument has no
    non-empty item, the remaining items of the other are returned as a list.
    """
    assert is_array(lines1)
    assert is_array(lines2)
    head = strip_list(lines1)
    tail = strip_list(lines2)
    if head and tail:
        merged = list(head[:-1])
        merged.append(head[-1] + tail[0])
        merged.extend(tail[1:])
        return merged
    return list(head) + list(tail)
def time_string(t):
    """Convert seconds since the Epoch to formatted local time string.

    The result is `HH:MM:SS' followed by the name of the time zone that is in
    effect at time T.
    """
    t = time.localtime(t)
    s = time.strftime('%H:%M:%S', t)
    # `time.daylight' only tells whether a DST zone is defined at all; the
    # per-timestamp flag says whether DST actually applies to T.
    if t.tm_isdst > 0:
        result = s + ' ' + time.tzname[1]
    else:
        result = s + ' ' + time.tzname[0]
    try:
        # Best-effort re-encoding of the zone name.
        # NOTE(review): `char_encode' is not defined in the visible source, so
        # this conversion is skipped unless it is provided elsewhere.
        result = char_encode(result.decode(locale.getdefaultlocale()[1]))
    except (NameError, AttributeError, LookupError, TypeError, ValueError):
        # Missing helper, a text type without decode(), or an unknown or
        # unusable locale encoding: keep the unconverted string.
        pass
    return result
def date_string(t):
    """Format T, given in seconds since the Epoch, as local date `YYYY-MM-DD'."""
    return time.strftime('%Y-%m-%d', time.localtime(t))
def file_exists(fname, dirname):
    """True if file FNAME resides inside dirname.

    FNAME must name an existing file.  DIRNAME must name an existing
    directory; the empty string stands for the current working directory.
    Both are resolved with realpath before they are compared, and FNAME may
    sit at any depth below DIRNAME.
    """
    assert os.path.isfile(fname)
    if dirname == '':
        dirname = os.getcwd()
    else:
        assert os.path.isdir(dirname)
    dirname = os.path.normcase(os.path.realpath(dirname))
    fname = os.path.normcase(os.path.realpath(fname))
    # Compare whole path components: a plain character-wise common prefix
    # would let `/a/bar' "contain" `/a/barbaz/x'.
    prefix = dirname
    if not prefix.endswith(os.sep):
        prefix += os.sep
    return fname.startswith(prefix)
def find_duplicate_code_ranges(D):
    """Find duplicate lines in a `Document' D. Returns list of duplicates or
    `None'.

    D must provide `lines' (normalised text per line, '' for blank lines) and
    `slurped' (the raw text of the same lines).  A run of at least
    `opt_treshold' non-blank lines that occurs again further down is printed
    as a chunk, followed by the location of every repetition.  Blank lines
    inside a repetition are skipped; a blank line inside the chunk ends it.

    Each entry of the returned list is a tuple of 1-based line numbers
    (chunk_first, chunk_last, duplicate_first, duplicate_last).
    """
    assert opt_treshold > 1
    R = []
    end = len(D.lines)
    chunk_num = 0
    i = 0                       # first line of the candidate chunk
    while i < end:
        if not D.lines[i]:
            i += 1
            continue
        chunk_size = 0          # length of the first reported match, if any
        k = i                   # chunk line expected to match next
        j = i + 1               # line compared against the chunk
        c = []                  # raw text of the chunk lines matched so far
        first = last = None     # first/last matching line of the repetition
        while True:
            at_eof = j >= end
            if not at_eof:
                b = D.lines[j]
                if not b:
                    # Blank lines never break a repetition.
                    j += 1
                    continue
                # k < j always holds, so D.lines[k] is a valid index.
                if b == D.lines[k]:
                    if not c:
                        first = j
                    last = j
                    c.append(D.slurped[k])
                    k += 1
                    j += 1
                    continue
            # Mismatch or end of input: report the match collected so far.
            # Handling end of input here also reports repetitions that run
            # up to the last line.
            if len(c) >= opt_treshold:
                if not chunk_size:
                    chunk_size = len(c)
                    chunk_num += 1
                    print(" chunk %u, %u-%u (%u)" % (chunk_num, i + 1, k, chunk_size))
                    for offset, text in enumerate(c):
                        print("% 9u: %s" % (i + offset + 1, text))
                # Report from the first to the last matching line, so blank
                # lines around the repetition are not counted into it.
                print(" duplicated at lines %u-%u (%u)" % (first + 1, last + 1, last - first + 1))
                R.append((i + 1, k, first + 1, last + 1))
                # Line j is not consumed: it may start the next repetition.
            elif not at_eof:
                j += 1
            if at_eof:
                break
            # Restart matching from the top of the chunk.
            k = i
            c = []
        # Skip the reported chunk, or advance one line when nothing matched.
        i += max(1, chunk_size)
    if R:
        return R
    return None
class Usage(Exception):
    """Raised for command-line errors; `msg' holds the text to show the user."""

    def __init__(self, msg):
        # Initialise the base class as well, so that str(err) and tracebacks
        # carry the message instead of an empty string.
        Exception.__init__(self, msg)
        self.msg = msg
# The document set shared by one program run.
Ds = DocumentSet()


def main(cmd, opts, rawfilenames):
    """
    Execute CMD with command-line options and arguments. OPTS and
    RAWFILENAMES conform to getopt-values.

    Collects the input files, reads and parses each of them, prints a line
    count per file and, unless `opt_dry_run' is set, reports the duplicate
    line ranges found in each file.

    Returns 0.  Raises Exception when CMD does not exist or an input file
    cannot be read.
    """
    if not os.path.exists(cmd):
        raise Exception("Non-existing command '%s'" % cmd)
    for k, v in opts:
        if k in ('-o', '--output'):
            # NOTE(review): a pathname is only recorded in Ds.outfile; nothing
            # in the visible source writes to it.  Only a non-string value
            # (e.g. a file object passed by a programmatic caller) redirects
            # the output.
            Ds.SetOutput(v)
            if not isinstance(v, str):
                sys.stdout = v
    Ds.SetInput(uniqify(rawfilenames))
    for v in Ds.inputfiles:
        if not isinstance(v, str):
            sys.stdin = v
    Ds.CompileDocuments()
    sys.stdout.flush()
    for D in Ds.documents:
        print("%s: reading" % (D.pathname))
        sys.stdout.flush()
        D.Slurp()
        D.Parse()
    for D in Ds.documents:
        print('%s: %u line(s)' % (D.pathname, len(D.lines)))
        sys.stdout.flush()
        if not opt_dry_run:
            find_duplicate_code_ranges(D)
        sys.stdout.flush()
    return 0
def parse_option_flags(opts):
    """Translate getopt (option, value) pairs OPTS into a settings dict.

    Every flag defaults to False and is switched on when its option occurs
    anywhere in OPTS.  `treshold' is None unless given.  Options not listed
    here (such as `-o') are ignored.  Raises ValueError when the threshold
    argument is not an integer.
    """
    flags = {
        'help': False,
        'version': False,
        'dry_run': False,
        'verbose': False,
        'amalgam': False,
        'abstract_scalars': False,
        'abstract_symbols': False,
        'treshold': None,
    }
    for k, v in opts:
        if k in ('--help', '-h'):
            flags['help'] = True
        elif k == '--version':
            flags['version'] = True
        elif k in ('--verbose', '-v'):
            # `-v' means verbose, as documented in the usage text.
            flags['verbose'] = True
        elif k in ('--dry-run', '--dry', '-n'):
            flags['dry_run'] = True
        elif k == '--amalgam':
            flags['amalgam'] = True
        elif k == '--abstract-scalars':
            flags['abstract_scalars'] = True
        elif k == '--abstract-symbols':
            flags['abstract_symbols'] = True
        elif k in ('--treshold', '-t'):
            flags['treshold'] = int(v)
    return flags


# Help text printed to stderr for usage errors; filled in with `msg', `cmd'
# and `globs'.
USAGE_TEXT = """
%(msg)s
Usage:
%(cmd)s [OPTION]... PATHNAME...
Options:
--abstract-scalars unify numbers and strings in input text (default: no)
-t N, --treshold=N set the min. # for duplicate lines (default: 4)
-n, --dry-run do not actually compare, only find and read files
-v, --verbose
--help, --version print help or version information
Description:
Duplicate code lines are only found per file unless the `--amalgam' option is
specified. Each PATHNAME can be a directory- or filename. In case of a
directory-name finds all known source files in the directory.
Known source files:
%(globs)s
Examples:
Find duplicate code tracks in all files of the current directory
$ find_duplicate_code *
Find duplicate code in all known source files
$ find_duplicate_code /path/to/sources
Analyze C++-files
$ find /path/to/c++files -type f -regex '^.+\\(cpp\\|h\\)$' | \\
xargs find_duplicate_code --treshold=5
"""


if __name__ == '__main__':
    import getopt
    R = 0
    # Compare version numbers component-wise; parsing sys.version as a float
    # misreads versions such as 3.10.
    required = tuple(int(part) for part in str(MIN_PYTHON_VERSION).split('.'))
    if sys.version_info[:2] < required:
        sys.stderr.write('FAILED: Python 2.4+ required\n')
        sys.exit(1)
    stdin, stdout = sys.stdin, sys.stdout
    cmd = sys.argv[0]
    try:
        try:
            try:
                opts, rest = getopt.getopt(sys.argv[1:],
                                           'ht:nvo:',
                                           ['help', 'dry', 'dry-run', 'verbose', 'version',
                                            'abstract-scalars', 'abstract-symbols', 'amalgam',
                                            'treshold=',
                                            'output='])
            except getopt.GetoptError:
                # sys.exc_info() keeps this valid on both Python 2 and 3.
                raise Usage(sys.exc_info()[1])
            try:
                flags = parse_option_flags(opts)
            except ValueError:
                raise Usage('Option "--treshold" requires an integer argument')
            if flags['help']:
                raise Usage("No man-page available")
            if flags['version']:
                print(VERSION)
                sys.exit(0)
            # Module-level assignments: these rebind the global settings read
            # by the rest of the program.
            opt_dry_run = flags['dry_run']
            opt_verbose = flags['verbose']
            opt_amalgam = flags['amalgam']
            opt_abstract_scalars = flags['abstract_scalars']
            opt_abstract_symbols = flags['abstract_symbols']
            if flags['treshold'] is not None:
                opt_treshold = flags['treshold']
            if opt_treshold < 2:
                # The duplicate search asserts a threshold greater than 1.
                raise Usage('Option "--treshold" must be at least 2')
            if len(rest) == 0:
                raise Usage('Missing file argument(s)')
            if opt_abstract_symbols:
                raise Usage('Sorry, "--abstract-symbols" not yet implemented in this version')
            if opt_amalgam:
                raise Usage('Sorry, "--amalgam" not yet implemented in this version')
            stdout.flush()
            try:
                R = main(cmd, opts, rest)
            except KeyboardInterrupt:
                R = 1
        except Usage:
            err = sys.exc_info()[1]
            sys.stderr.write(USAGE_TEXT % {"cmd": cmd,
                                           "msg": err.msg,
                                           "globs": opt_known_globs} + '\n')
            R = 2
    finally:
        # main() may have redirected the standard streams.
        sys.stdin, sys.stdout = stdin, stdout
    if R == 0:
        print("OK(%d)" % (R))
    elif R < 0:
        print("FAILED(%d)" % (R))
    sys.exit(R)