#!/usr/bin/env python

# This script counts the lines of code in various types of source files.
# It has options to print summaries at various levels.
#
# It can be used with python version 2 and 3.

# todo:
# - possibly specify/override/add files/dirs to be ignored on command line

import re
import os
import sys
import stat
import time
import argparse

# Define the various file types and their comment markers for single line
# and multi line (block) comments.
# Each entry is a tuple with the following fields (in this order):
#  - The file type (the name used in the printed summaries)
#  - A list of matching file name extensions (without the leading dot)
#  - None or a regex defining a matching file name extension pattern
#    (it is matched at the start of the extension using re.match)
#  - A list of matching file names (compared with the name without directory)
#  - The delimiter defining a comment in a single line (empty = none)
#  - The delimiters defining the start and end of a block comment
#    (empty start delimiter = no block comments)
#  - Indication if file contains code or other info (1=code, 0=other)
# The order of the entries matters: the list is searched from the top and
# the first matching entry is used.
# Three entries have a special meaning:
#  - 'binary'  is used for files that do not look like a text file
#  - 'ignore'  is used for files that are counted as a file only; their
#              lines are not counted
#  - 'unknown' is used for files that match no entry and have no known shebang
# Similar to cloc there are a few theoretical problems:
# 1. If a quoted string contains comment delimiters, they are recognized
#    as comment delimiters. In principle some regexes could be defined to replace
#    such strings, possibly after first replacing escaped backslashes and quotes.
#    This is quite some work for cases that do not occur in practice.
# 2. A regex like '""".*"""' is greedy, thus a line like """some1"""some2"""
#    is fully matched. In practice such lines are not used.
# See cnt.py-new for an attempt at solving these issues.
types = [ ('C++', ['cc', 'tcc', 'hcc', 'cpp', 'cxx'], None, [], '//', '/*', '*/', 1),
          ('C++Hdr', ['h', 'hpp', 'hxx'], None, [], '//', '/*', '*/', 1),
          ('C', ['c'], None, [], '//', '/*', '*/', 1),
          ('Cuda', ['cu'], None, [], '//', '/*', '*/', 1),
          ('OpenCL', ['cl'], None, [], '//', '/*', '*/', 1),
          ('Fortran', ['f', 'for'], None, [], '*', '', '', 1),
          ('Assembly', ['m', 'S'], None, [], '', '', '', 1),
          ('Lisp', ['lisp'], None, [], '', '', '', 1),
          ('SQL', ['sql'], None, [], '--', '', '', 1),
          ('TaQL', ['taql'], None, [], '#', '', '', 1),
          ('Flex', ['l', 'll'], None, [], '//', '/*', '*/', 1),
          ('Bison', ['y', 'yy'], None, [], '//', '/*', '*/', 1),
          ('Python', ['py', 'python', 'python3'], None, [], '#', '"""', '"""', 1),
          ('Perl', ['pl', 'perl'], None, [], '#', '', '', 1),
          ('test-run', ['run'], None, [], '','','', 0),
          ('test-in', ['in'], re.compile('in_.*'), [], '','','', 0),
          ('test-out', ['out', 'stdout'], None, [], '','','', 0),
          ('sh', ['sh'], None, [], '#', '', '', 0),
          ('bash', ['bash'], None, [], '#', '', '', 0),
          ('csh', ['csh'], None, [], '#', '', '', 0),
          ('tcsh', ['tcsh'], None, [], '#', '', '', 0),
          ('CMake', ['cmake'], None, ['CMakeLists.txt'], '#', '', '', 0),
          ('Config', ['conf', 'cfg', 'dat'], None, [], '#', '', '', 0),
          ('Component', ['comp'], None, [], '#', '', '', 0),
          ('parset', ['parset'], re.compile('parset.*'), [], '#', '', '', 0),
          ('log_prop', ['log_prop'], None, [], '#', '', '', 0),
          ('rst', ['rst'], None, [], '', '', '', 0),
          ('doxygen', ['dox'], None, ['doxygen.cfg'], '', '', '', 0),
          ('xml', ['xml', 'xsl', 'xsd'], None, [], '', '', '', 0),
          ('html', ['html', 'htm'], None, [], '', '', '', 0),
          ('binary', [], None, [], '', '', '', 0),               # binary files
          ('ignore', ['log', 'shar', 'tmp', 'ps', 'fig','omt'], re.compile('.*(~|-sav|-new|info|params|sed|md)'), ['templates','makefile','changelog','.gitignore','.travis.yml'], '', '', '', 0),  # files to be ignored
          ('unknown', [], None, [], '', '', '', 0) ]


def showTypes (verbose):
    """Print the file types known to this script.

    For each type its name, its code indicator and its file name extensions
    are printed. If verbose is true, the extension pattern, the file names
    and the comment markers are printed as well (only those that are set).
    """
    for entry in types:
        (name, exts, extre, filenms, comm, scomm, ecomm, ctyp) = entry
        print ('%-24s code=%d' % (name,ctyp))
        print ('  file name extensions: ', exts)
        # The remaining details are only shown on request.
        if not verbose:
            continue
        if extre is not None:
            print ('  extension pattern:    ', extre.pattern)
        if filenms:
            print ('  file names:           ', filenms)
        if comm:
            print ('  comment marker:       ', comm)
        if scomm:
            print ('  start comment block:  ', scomm)
            print ('  end   comment block:  ', ecomm)


# Define regex for a line containing an alphanumeric character.
# Note that \w also matches an underscore.
# A raw string is used, because '\w' in a normal string literal is an
# invalid escape sequence (newer Python versions warn about it).
reAlphaNum  = re.compile(r'\w')

def hasAlphaNum (line):
    """Tell if the line contains at least one alphanumeric character.

    Arguments:
      line: the string to test

    Returns True if a letter, digit or underscore is found, otherwise False.
    An empty line or a line with only whitespace or punctuation gives False.
    """
    # The search fails on an empty or blank line, so no strip is needed.
    return reAlphaNum.search(line) is not None


# From http://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python
# Might treat UTF-16 files also as binary.
def is_textfile(filename):
    """Tell if a file looks like a text file.

    The file is read in binary mode in blocks of 4096 bytes. It is seen as
    a text file if no NUL byte is found in it.

    Arguments:
      filename: name of the file to test

    Returns False if a NUL byte is found, otherwise True.
    """
    blocksize = 4096
    with open(filename, 'rb') as stream:
        data = stream.read(blocksize)
        while True:
            # b'\0' is a str in python 2 and bytes in python 3, so the
            # same test can be used for both versions.
            if b'\0' in data:
                return False
            # A short block means the end of the file has been reached.
            if len(data) < blocksize:
                return True
            data = stream.read(blocksize)

def add_escape(str):
    """Put a backslash before each special regex character in a string.

    The result can be used in a regex to match the string literally.
    All regex metacharacters are handled (also the parentheses, braces,
    plus, question mark, caret, dollar and the backslash itself), so a
    comment marker like '(*' can be used as well.

    Arguments:
      str: the string to escape (the name shadows the builtin; it is kept
           because it is part of the function's interface)

    Returns the escaped string.
    """
    special = '\\.^$*+?{}[]|()'
    return ''.join('\\' + c if c in special else c for c in str)
    
    
def countcodecomm (filename, linecomm, scomm='', ecomm='', basic=False):
    """Count the code, comment, blank and header lines in a source file.

    Arguments:
      filename: name of the file to count
      linecomm: comment marker for a single line (empty is no line comments)
      scomm:    start block comment marker (empty is no block comments)
      ecomm:    end block comment marker
      basic:    False = only count lines with >= 1 alphanumeric char and
                        count the header (the leading lines starting with
                        linecomm) separately
                True  = count all non-blank lines as code or comment

    Returns a tuple with nr of files (always 1), nr of lines, nr of code
    lines, nr of comment lines, nr of blank lines, and nr of header lines.
    """
    nhdr   = 0
    nblank = 0
    ncomm  = 0
    ncode  = 0
    nline  = 0
    skipHeader = not basic
    blockComm = False
    if len(scomm) > 0:
        scomm_esc = add_escape(scomm)
        # The end marker must be escaped itself. Using the start marker here
        # would make all end-of-block regexes look for the start marker,
        # which breaks file types where both differ (like /* and */).
        ecomm_esc = add_escape(ecomm)
        reComm2a = re.compile(r'\s*' + scomm_esc + r'\s*' + ecomm_esc + r'\s*')
        reComm2b = re.compile(r'\s*' + scomm_esc + '(.*)' + ecomm_esc + r'\s*')
        reSComm = re.compile(scomm_esc)
        reEComm = re.compile(ecomm_esc)
        reTillSComm = re.compile('.*' + scomm_esc + r'\s*')
        reTillEComm = re.compile('.*' + ecomm_esc + r'\s*')
        reFromSComm = re.compile(r'\s*' + scomm_esc + '.*')
        reFromEComm = re.compile(r'\s*' + ecomm_esc + '.*')
    # Loop over all lines in the file.
    # The with statement makes sure the file is closed, also on an error.
    with open(filename) as f:
        for line in f:
            nline += 1
            # Skip file header till first non-comment line.
            # This header is usually the licensing info.
            if skipHeader:
                if len(linecomm) > 0  and  line[:len(linecomm)] == linecomm:
                    nhdr += 1
                    continue
                skipHeader = False
            # Remove leading and trailing whitespace (including newline)
            line = line.strip()
            if len(line) == 0:
                nblank += 1
            else:
                # Handle lines in a block comment.
                if blockComm:
                    if reEComm.search (line):
                        # End of block comment
                        blockComm = False
                        # Remove the part until the comment marker.
                        # If nothing left, it can be a comment line.
                        l1 = reTillEComm.sub ('', line)
                        if len(l1) == 0  or  (not basic and not hasAlphaNum(l1)):
                            if basic  or  hasAlphaNum(reFromEComm.sub ('', line)):
                                ncomm += 1
                            continue
                        line = l1
                    else:
                        # A line inside a block comment
                        # Only count as comment if an alphanumeric in it
                        if basic  or  hasAlphaNum(line):
                            ncomm += 1
                        continue
                # Test for start of block comment.
                hasBlCom = False
                if len(scomm) > 0:
                    # Remove empty block comments on a single line.
                    l1 = reComm2a.sub ('', line)
                    if len(l1) == 0:
                        # If nothing left, the line contains 'scomm ecomm' only.
                        if basic:
                            ncomm += 1
                        continue
                    # Remove non-empty block comments on a single line.
                    l1 = reComm2b.sub ('', l1)
                    if l1 != line:
                        hasBlCom = hasAlphaNum(reComm2b.sub (r'{\1}', line))
                        if len(l1) == 0:
                            # Nothing left; count if appropriate.
                            if basic  or  hasBlCom:
                                ncomm += 1
                            continue
                    line = l1
                    # Check for the start of a block comment
                    if reSComm.search (line):
                        blockComm = True
                        # Remove the part past the comment marker.
                        # If nothing left, it can be a comment line.
                        l1 = reFromSComm.sub ('', line)
                        if len(l1) == 0:
                            if basic  or  hasBlCom  or  hasAlphaNum(reTillSComm.sub ('', line)):
                                ncomm += 1
                            continue
                        line = l1
                # A code or a single comment line
                # Count if it contains an alphanumeric character.
                if basic  or  hasAlphaNum(line):
                    if len(linecomm) > 0  and  line[:len(linecomm)] == linecomm:
                        ncomm += 1
                    else:
                        ncode += 1
                elif hasBlCom:
                    # Also count as comment if there was a comment block.
                    ncomm += 1
    return (1, nline, ncode, ncomm, nblank, nhdr)

def printHeader():
    """Write the line with the column titles of the counts to stdout."""
    titles = ('Type','Files','Lines','Code','Comment','Blank','Header','Other')
    layout = '%9s%7s%9s%9s%16s%16s%9s%9s\n'
    sys.stdout.write (layout % titles)
    
def printCount(file, type, cnt, ccperc):
    """Write a line with the counts of a file type.

    Arguments:
      file:   the file object to write to
      type:   the name of the file type
      cnt:    sequence with nr of files, lines, code lines, comment lines,
              blank lines and header lines
      ccperc: True  = percentages are relative to the nr of code+comment lines
              False = percentages are relative to the total nr of lines

    If the nr to relate to is zero, only the type and the nr of files are
    written (and nothing at all if the nr of files is zero as well).
    """
    # Determine the nr of lines the percentages refer to.
    if ccperc:
        total = cnt[2] + cnt[3]    # code + comment
    else:
        total = cnt[1]
    if total > 0:
        codeperc = 100. * cnt[2] / total
        commperc = 100. * cnt[3] / total
        # All lines not counted in one of the categories.
        other = cnt[1]-cnt[2]-cnt[3]-cnt[4]-cnt[5]
        file.write ('%9s %6d %8d %8d %5.1f%% %8d %5.1f%% %8d %8d %8d\n' % (type, cnt[0], cnt[1], cnt[2], codeperc, cnt[3], commperc, cnt[4], cnt[5], other))
    elif cnt[0] > 0:
        file.write ('%9s %6d\n' % (type, cnt[0]))

def countother(filename, basic, usecode):
    """Count a file whose type could not be derived from its name.

    If the first line of the file is a shebang, the name of the interpreter
    is used to derive the file type. A directory part, a leading 'env',
    interpreter arguments and a trailing comment are ignored, so both
    '#!/usr/bin/env python' and '#!/bin/bash -e' are recognized.
    Otherwise the file is counted as unknown.

    Arguments:
      filename: name of the file to count
      basic:    passed on to countcodecomm for a known file type
      usecode:  True = only files containing code are of interest, so the
                lines of other files are not counted

    Returns a tuple with the file type, its code indicator (1=code, 0=other)
    and a tuple with the counts (nr of files, lines, code lines, comment
    lines, blank lines and header lines).
    """
    nline  = 0
    nblank = 0
    # The with statement makes sure the file is always closed.
    with open(filename) as f:
        for line in f:
            # Test first line for shebang (it has no meaning on other lines).
            if nline == 0  and  line[:2] == '#!':
                # Remove shebang and comment and split into words.
                words = line[2:].split('#', 1)[0].split()
                # The interpreter is the first word, or the next one if
                # the first word is env. Remove till last slash.
                ext = ''
                if words:
                    ext = words[0].split('/')[-1]
                    if ext == 'env'  and  len(words) > 1:
                        ext = words[1].split('/')[-1]
                ext = ext.lower()
                # Count a known file type
                for (ftype,exts,extre,filenms,comm,scomm,ecomm,ctyp) in types:
                    if ext in exts:
                        if usecode and ctyp==0:
                            return (ftype, ctyp, (1,0,0,0,0,0))
                        return (ftype, ctyp, countcodecomm (filename,comm,scomm,ecomm,basic))
            if len(line.strip()) == 0:
                nblank += 1
            nline += 1
    # Unknown file type, nothing to be counted.
    if usecode:
        return ('unknown', 0, (1,0,0,0,0,0))
    return ('unknown', 0, (1,nline,0,0,nblank,0))

def countfiles(dirname, test, basic, ccperc, verbose, printlevel, level, usecode, dosum, warn_unknown):
    """Count the files in a directory and recursively in its subdirectories.

    Arguments:
      dirname:      name of the directory to handle
      test:         True = count a directory named 'test' separately
      basic:        passed on to the counting of a file
      ccperc:       passed on to printCount (how to calculate percentages)
      verbose:      True = write the counts of each file on stderr
      printlevel:   the counts are printed for levels <= printlevel
      level:        the level of this directory (the caller passes 0 for top)
      usecode:      True = only use the file types containing code
      dosum:        True = print the sum of all file types only
      warn_unknown: True = report files with an unknown type on stderr

    Names .git, .svn, .cvs, CVS and doc are skipped, as are symlinks.

    Returns a list of two dicts (normal counts and test directory counts).
    Each dict maps a file type to a list with nr of files, lines, code lines,
    comment lines, blank lines and header lines.
    """
    nfield = 6
    sums = [{}, {}]
    for slot in sums:
        for entry in types:
            slot[entry[0]] = [0] * nfield
    # Determine if it is a test directory.
    inx = 1 if (test  and  os.path.basename(dirname) == 'test') else 0
    for name in os.listdir(dirname):
        if name in ['.git', '.svn', '.cvs', 'CVS', 'doc']:
            continue
        ffile = os.path.join(dirname,name)
        try:
            mode = os.lstat(ffile).st_mode
        except OSError:
            sys.stderr.write ('No such file: %s\n' % ffile)
            continue
        if stat.S_ISLNK(mode):
            # skip symlinks because casacore contains symlink to itself
            continue
        if stat.S_ISDIR(mode):
            # Add the counts of the subdirectory to the counts so far.
            subsums = countfiles (ffile, test, basic, ccperc, verbose, printlevel, level+1, usecode, dosum, warn_unknown)
            for j in (0,1):
                for entry in types:
                    total = sums[j][entry[0]]
                    for i, val in enumerate(subsums[j][entry[0]]):
                        total[i] += val
            continue
        if not stat.S_ISREG(mode):
            # Neither a directory nor a regular file
            continue
        if not is_textfile(ffile):
            ftype = 'binary'
            ctyp = 0
            cnt = (1,0,0,0,0,0)
        else:
            # Get the extension without the leading dot.
            ext = os.path.splitext(ffile)[1][1:]
            for (ftype,exts,extre,filenms,comm,scomm,ecomm,ctyp) in types:
                if name in filenms  or  ext in exts  or  (extre is not None and extre.match(ext)):
                    if ftype == 'ignore'  or  (usecode and ctyp==0):
                        cnt = (1,0,0,0,0,0)
                    else:
                        cnt = countcodecomm (ffile,comm,scomm,ecomm,basic)
                    break
            else:
                # No type matches the file name, so try the shebang.
                (ftype,ctyp,cnt) = countother (ffile, basic, usecode)
        if usecode  and  ctyp == 0:
            continue
        for i, val in enumerate(cnt):
            sums[inx][ftype][i] += val
        if ftype == 'unknown':
            if warn_unknown:
                sys.stderr.write ('Unknown type: %s\n' % ffile)
        elif verbose:
            sys.stderr.write ('** %s\n' % ffile)
            printCount (sys.stderr, ftype, cnt, ccperc)
    if level <= printlevel:
        bl = level*2*' '
        for j in (0,1):
            first = True
            sumall = [0] * nfield
            for entry in types:
                c = sums[j][entry[0]]
                if c[0] <= 0:
                    continue
                if first:
                    # Write the directory name before its first count.
                    tc = ' testcode' if j == 1 else ''
                    sys.stdout.write ('%s%s%s\n' % (bl,dirname,tc))
                    first = False
                if dosum:
                    for i, val in enumerate(c):
                        sumall[i] += val
                else:
                    printCount (sys.stdout, entry[0], c, ccperc)
            if dosum:
                printCount (sys.stdout, '', sumall, ccperc)
    return sums

def testit():
    # Print the counts of two fixed test files, first with the default
    # arguments and thereafter with basic given explicitly as False.
    cases = [ ('/Users/diepen/testcnt1', '#'),
              ('/Users/diepen/testcnt2', '#', '"""', '"""'),
              ('/Users/diepen/testcnt1', '#', '', '', False),
              ('/Users/diepen/testcnt2', '#', '"""', '"""', False) ]
    for args in cases:
        print (countcodecomm (*args))
    

if __name__ == '__main__':
    # Define the options.
    # No prog is given, so argparse shows the name of the script in the
    # usage line (instead of a placeholder name).
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--basic', help='count copyright header and lines without an alphanumeric character as code/comment lines', action='store_true')
    parser.add_argument('-c', '--code', help='only use source files containing code (e.g. no .parset)', action='store_true')
    parser.add_argument('-s', '--sum', help='only calculate and print the sum of all file types', action='store_true')
    parser.add_argument('-l', '--limitperc', help='limit to the nr of code and comment lines to determine percentages', action='store_true')
    parser.add_argument('-p', '--printlevel', type=int, default=0, help='first directory level to print (default 0 (=top))')
    parser.add_argument('-w', '--warn_unknown', help='warn if a file with an unknown type is found', action='store_true')
    parser.add_argument('-d', '--displaytypes', help='display the currently recognized file types (full info with -v)', action='store_true')
    parser.add_argument('-t', '--testinclude', help='do not count test directories separately', action='store_true')
    parser.add_argument('-v', '--verbose', help='print count for each source file', action='store_true')
    parser.add_argument('directory', nargs='?', default='.', help='name of top directory to count source files (default is .)')
    # If nothing given, show a description and the options.
    if len(sys.argv) == 1:
        intro = ('',
                 'countcode counts per known source file type the number of source lines in the',
                 ' files in the given directory and recursively in its subdirectories.',
                 'It supports many file types. The type is recognized from the file name extension',
                 ' or the shebang script type. Use -d to see all supported types.',
                 'The following line types are counted:',
                 '  code:     pure code lines',
                 '  comment:  pure comment lines',
                 '  blank:    empty lines or lines containing whitespace only',
                 '  header:   the copyright header (leading comment lines)',
                 '  other:    all other lines  (e.g., single {, /*, etc.)',
                 'Unless -b is given, a pure code or comment line has to contain an alphanumeric',
                 ' character; e.g., a single } does not count as code line.',
                 'It calculates the percentage of code and comment lines in the total number of',
                 ' lines or (if -l is given) in the sum of code and comment lines.',
                 'Unless -t is given, files in test directories are counted separately.',
                 'Normal output is written on stdout; verbose on stderr.',
                 'Files with an unknown type are reported on stderr.',
                 'Note that -bt should give about the same results as a tool like cloc.',
                 '')
        for text in intro:
            print (text)
        parser.print_help()
    else:
        values = parser.parse_args()
        if values.displaytypes:
            showTypes (values.verbose)
        else:
            dirname = values.directory
            test    = not values.testinclude
            # Remove possible trailing slash
            if len(dirname) > 1  and  dirname[-1] == '/':
                dirname = dirname[:-1]
            # The settings are written before and after the counts; each
            # time they are preceded by the current time.
            settings = '   Count %s  test=%d basic=%d limitperc=%d code=%d\n' % (dirname,test,values.basic,values.limitperc,values.code)
            sys.stdout.write (time.ctime() + settings)
            printHeader()
            countfiles (dirname, test, values.basic, values.limitperc, values.verbose, values.printlevel, 0, values.code, values.sum, values.warn_unknown)
            printHeader()
            sys.stdout.write (time.ctime() + settings)
