#!/usr/bin/env python
# encoding: utf-8
#
# OtterTune - source_validator.py
#
# Copyright (c) 2017-18, Carnegie Mellon University Database Group
#
# ==============================================
# SOURCE VALIDATOR
# ==============================================
#
# Adapted from the source validator used by Peloton.
# (see https://github.com/cmu-db/peloton/blob/master/script/validators/source_validator.py)
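#
# Example usage (command-line options are defined in main() below; paths are
# illustrative):
#   python source_validator.py                    # validate the default directories
#   python source_validator.py --staged-files     # validate Python/Java files staged in git
#   python source_validator.py server/analysis    # validate specific files or directories
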
import argparse
import logging
import imp
import os
import re
import sys
import json
import functools
from collections import namedtuple
from fabric.api import lcd, local, settings, quiet
EXIT_SUCCESS = 0
EXIT_FAILURE = -1
# ==============================================
# CONFIGURATION
# ==============================================
# Logging
LOG = logging.getLogger(__name__)
LOG.addHandler(logging.StreamHandler())
LOG.setLevel(logging.INFO)
# NOTE: the absolute path to the ottertune directory is calculated from the current
# directory structure: ottertune/script/validators/<this_file>
# OTTERTUNE_DIR needs to be redefined if the directory structure is changed.
CODE_SOURCE_DIR = os.path.abspath(os.path.dirname(__file__))
OTTERTUNE_DIR = os.path.abspath(functools.reduce(os.path.join,
                                                 [CODE_SOURCE_DIR,
                                                  os.path.pardir,
                                                  os.path.pardir]))
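# For example, with this file at <repo>/script/validators/source_validator.py,
# OTTERTUNE_DIR resolves to the repository root <repo>.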
# Other directory paths used are relative to OTTERTUNE_DIR
DEFAULT_DIRS = [
    OTTERTUNE_DIR
]

# Directories that should NOT be checked
EXCLUDE_DIRECTORIES = [
    # Django-generated directories
    os.path.join(OTTERTUNE_DIR, "server/website/website/migrations"),
    # Source code files from json.org
    os.path.join(OTTERTUNE_DIR, "client/controller/src/main/java/com/controller/util/json"),
    # Django settings
    os.path.join(OTTERTUNE_DIR, 'server/website/website/settings'),
    # Docker files
    os.path.join(OTTERTUNE_DIR, 'docker'),
    # Django manage.py extensions
    os.path.join(OTTERTUNE_DIR, "server/website/website/management"),
    # Stand-alone scripts
    os.path.join(OTTERTUNE_DIR, "server/website/script"),
]

# Files that should NOT be checked
EXCLUDE_FILES = [
    # Django-generated files
    os.path.join(OTTERTUNE_DIR, 'server/website/manage.py'),
    # file causing import error
    os.path.join(OTTERTUNE_DIR, 'server/analysis/simulation.py'),
]

# Regex patterns
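# PYCODESTYLE_COMMENT_PATTERN matches inline suppression comments such as
# "# pycodestyle: disable=E201,E226" (handled by CustomReporter.get_file_results below).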
PYCODESTYLE_COMMENT_PATTERN = re.compile(r'#\s*pycodestyle:\s*disable\s*=\s*[\w\,\s]+$')
PYTHON_ILLEGAL_PATTERNS = [
    (re.compile(r'^print[ (]'), "Do not use 'print'. Use the logging module instead.")
]

JAVA_ILLEGAL_PATTERNS = [
    (re.compile(r'^System.out.println'), "Do not use println. Use the logging module instead.")
]
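
# The header patterns below expect a file header like the one at the top of this
# file, e.g. for Python:
#   #
#   # OtterTune - source_validator.py
#   #
#   # Copyright (c) 2017-18, Carnegie Mellon University Database Group
#   #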
PYTHON_HEADER_PATTERN = re.compile(r'#\n#.*\n#\n# Copyright.*\n#\n')
JAVA_HEADER_PATTERN = re.compile(r'/\*\n \*.*\n \*\n \* Copyright.*\n \*/\n\n')
# Stdout format strings
SEPARATOR = 80 * '-'
OUTPUT_FMT = (
    '' + SEPARATOR + '\n\n'
    '\033[1m'  # start bold text
    '%s\n'
    'FAILED: %s\n\n'
    '\033[0m'  # end bold text
    '%s'
)
VALIDATOR_FMT = '{name}\n{u}\n{out}'.format
MSG_PREFIX_FMT = ' {filename}:{line:3d}: '.format
MSG_SUFFIX_FMT = ' ({symbol})'.format
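# format_message() below combines these into lines such as (illustrative values):
#   " example.py: 10: Do not use println. Use the logging module instead."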
# ==============================================
# UTILITY FUNCTION DEFINITIONS
# ==============================================
def format_message(filename, line, message, symbol=None):
    out_prefix = MSG_PREFIX_FMT(filename=filename, line=line)
    out_suffix = '' if symbol is None else MSG_SUFFIX_FMT(symbol=symbol)

    # Crop the message details to make the output more readable
    max_msg_len = 80 - len(out_prefix) - len(out_suffix)
    if len(message) > max_msg_len:
        message = message[:max_msg_len - 3] + '...'
    output = (out_prefix + message + out_suffix).replace('\n', '')
    return output + '\n'


def validate_validator(modules, config_path):
    status = True

    # Check if required modules are installed
    for module in modules:
        if module is not None:
            try:
                imp.find_module(module)
            except ImportError:
                LOG.error("Cannot find module %s", module)
                status = False

    # Check that the config file exists if assigned
    if config_path is not None and not os.path.isfile(config_path):
        LOG.error("Cannot find config file %s", config_path)
        status = False
    return status


# Validate the file passed as argument
def validate_file(file_path):
    if file_path in EXCLUDE_FILES:
        return True
    if not file_path.endswith(".py") and not file_path.endswith(".java"):
        return True
    for exclude_dir in EXCLUDE_DIRECTORIES:
        if file_path.startswith(exclude_dir):
            return True

    LOG.debug("Validating file: %s", file_path)
    status = True
    output = []
    failed_validators = []
    for validator in VALIDATORS:
        val_status, val_output = validator.validate_fn(
            file_path, validator.config_path)
        if not val_status:
            status = False
            output.append(VALIDATOR_FMT(name=validator.name,
                                        u='-' * len(validator.name),
                                        out=val_output))
            failed_validators.append(validator.name)

    if not status:
        LOG.info(OUTPUT_FMT, file_path, ', '.join(failed_validators), '\n'.join(output))
    return status


# Validate all the files in the root_dir passed as argument
def validate_dir(root_dir):
    for exclude_dir in EXCLUDE_DIRECTORIES:
        if root_dir.startswith(exclude_dir):
            return True

    status = True
    for root, dirs, files in os.walk(root_dir):  # pylint: disable=not-an-iterable
        # Remove excluded dirs from list
        valid_dirs = []
        for d in dirs:
            valid = True
            for exclude_dir in EXCLUDE_DIRECTORIES:
                # Compare the full path (not just the directory name) against
                # the absolute excluded directories
                if os.path.join(root, d).startswith(exclude_dir):
                    valid = False
                    break
            if valid:
                valid_dirs.append(d)
        dirs[:] = valid_dirs

        # Validate files
        for file_path in files:
            file_path = os.path.join(root, file_path)
            if not validate_file(file_path):
                status = False
    return status


def get_git_files(state):
    if state == 'staged':
        # Files staged for commit
        cmd = r"git diff --name-only --cached --diff-filter=d | grep -E '.*\.(py|java)$'"
    elif state == 'unstaged':
        # Tracked files not staged for commit
        cmd = r"git diff --name-only --diff-filter=d | grep -E '.*\.(py|java)$'"
    elif state == 'untracked':
        # Untracked files not staged for commit
        cmd = r"git ls-files --other --exclude-standard | grep -E '.*\.(py|java)$'"

    with settings(warn_only=True):
        res = local(cmd, capture=True)
    if res.succeeded:
        targets = res.stdout.strip().split('\n')
        if not targets:
            LOG.warning("No %s files found.", state)
    else:
        LOG.error("An error occurred while fetching %s files (exit code %d). "
                  "Exiting...\n\n%s\n", state, res.return_code, res.stderr)
        sys.exit(EXIT_FAILURE)
    return targets


# ==============================================
# VALIDATOR FUNCTION DEFINITIONS
# ==============================================
def check_pylint(file_path, config_path=None):
    if not file_path.endswith(".py"):
        return True, None

    options = [
        '--output-format=json',
        '--reports=yes',
    ]
    if config_path is not None:
        options.append('--rcfile=' + config_path)

    with settings(warn_only=True), quiet():
        res = local('pylint {} {}'.format(' '.join(options), file_path), capture=True)
    if res.stdout == '':
        if res.return_code != 0:
            raise Exception(
                'An error occurred while running pylint on {} (exit code {}).\n\n{}\n'.format(
                    file_path, res.return_code, res.stderr))
        return True, None

    output = []
    errors = json.loads(res.stdout)
    for entry in errors:
        # Remove extra whitespace and hints
        msg = entry['message'].replace('^', '').replace('|', '')
        msg = re.sub(' +', ' ', msg)
        msg = msg.strip()
        output.append(format_message(os.path.basename(file_path), entry['line'],
                                     msg, entry['symbol']))
    output = ''.join(output)
    return res.return_code == 0, output


def check_pycodestyle(file_path, config_path=None):
    import pycodestyle

    if not file_path.endswith(".py"):
        return True, None

    # A custom reporter class for pycodestyle that checks for disabled errors
    # and formats the style report output.
    class CustomReporter(pycodestyle.StandardReport):
        def get_file_results(self):
            # Iterates through the lines of code that generated lint errors and
            # checks if the given error has been disabled for that line via an
            # inline comment (e.g., # pycodestyle: disable=E201,E226). Those
            # that have been disabled are not treated as errors.
            self._deferred_print.sort()
            results = []
            prev_line_num = -1
            prev_line_errs = []
            for line_number, _, code, text, _ in self._deferred_print:
                if prev_line_num == line_number:
                    err_codes = prev_line_errs
                else:
                    line = self.lines[line_number - 1]
                    m = PYCODESTYLE_COMMENT_PATTERN.search(line)
                    if m and m.group(0):
                        err_codes = [ec.strip() for ec in m.group(0).split('=')[1].split(',')]
                    else:
                        err_codes = []
                prev_line_num = line_number
                prev_line_errs = err_codes
                if code in err_codes:
                    # Error is disabled in source
                    continue
                results.append(format_message(os.path.basename(file_path),
                                              self.line_offset + line_number,
                                              text, code))
            return results, len(results) == 0
    # END CustomReporter class

    options = {} if config_path is None else {'config_file': config_path}
    style = pycodestyle.StyleGuide(quiet=True, **options)

    # Set the reporter option to our custom one
    style.options.reporter = CustomReporter
    style.init_report()
    report = style.check_files([file_path])
    results, status = report.get_file_results()
    output = None if status else ''.join(results)
    return status, output


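# Example of suppressing specific pycodestyle checks on a single source line
# (codes shown are illustrative):
#   x = ( 1+2)  # pycodestyle: disable=E201,E226

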
def check_java_checkstyle(file_path, config_path=None):
    if not file_path.endswith(".java"):
        return True, None

    options = '' if config_path is None else '-c ' + config_path
    with quiet():
        res = local("checkstyle {} {}".format(options, file_path), capture=True)
    lines = res.stdout.split('\n')
    assert len(lines) >= 2 and lines[0] == "Starting audit..." and lines[-1] == "Audit done."
    if len(lines) == 2:
        return True, None

    output = []
    for line in lines[1:-1]:
        parts = line.strip().split(':')
        line_number = int(parts[1])
        text, code = parts[-1].rsplit('[', 1)
        text = text.strip()
        code = code[:-1]
        output.append(format_message(os.path.basename(file_path), line_number, text, code))
    output = ''.join(output)
    return False, output


def check_illegal_patterns(file_path, config_path=None):  # pylint: disable=unused-argument
    if file_path.endswith(".py"):
        illegal_patterns = PYTHON_ILLEGAL_PATTERNS
        comment = "#"
    elif file_path.endswith(".java"):
        illegal_patterns = JAVA_ILLEGAL_PATTERNS
        comment = "//"
    else:
        return True, None

    line_num = 1
    output = []
    status = True
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            for pattern_info in illegal_patterns:
                if not line.startswith(comment) and pattern_info[0].search(line):
                    output.append(format_message(filename=os.path.basename(file_path),
                                                 line=line_num,
                                                 message=pattern_info[1]))
                    status = False
            line_num += 1
    output = None if status else ''.join(output)
    return status, output


def check_header(file_path, config_file=None):  # pylint: disable=unused-argument
    if file_path.endswith(".py"):
        header_pattern = PYTHON_HEADER_PATTERN
    elif file_path.endswith(".java"):
        header_pattern = JAVA_HEADER_PATTERN
    else:
        return True, None

    status = True
    output = None
    with open(file_path, 'r') as f:
        file_contents = f.read()
    header_match = header_pattern.search(file_contents)
    filename = os.path.basename(file_path)
    if header_match:
        if filename not in header_match.group(0):
            status = False
            output = format_message(filename=filename, line=2,
                                    message="Incorrect filename in header")
    else:
        status = False
        output = format_message(filename=filename, line=1,
                                message='Missing header')
    return status, output


# ==============================================
# VALIDATORS
# ==============================================
# Struct for storing validator metadata
Validator = namedtuple('Validator', 'name validate_fn modules config_path')
VALIDATORS = [
    # Runs pylint on python source
    Validator('check_pylint', check_pylint, ['pylint'],
              os.path.join(OTTERTUNE_DIR, "script/formatting/config/pylintrc")),
    # Runs pycodestyle on python source
    Validator('check_pycodestyle', check_pycodestyle, ['pycodestyle'],
              os.path.join(OTTERTUNE_DIR, "script/formatting/config/pycodestyle")),
    # Runs checkstyle on the java source
    Validator("check_java_checkstyle", check_java_checkstyle, [],
              os.path.join(OTTERTUNE_DIR, "script/formatting/config/google_checks.xml")),
    # Checks that the python/java source files do not use illegal patterns
    Validator('check_illegal_patterns', check_illegal_patterns, [], None),
    # Checks that the python/java source files have headers
    Validator('check_header', check_header, [], None)
]
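
# To add a new validator, define a function with the signature
# validate_fn(file_path, config_path) -> (status, output) and register it in
# VALIDATORS above, listing any required modules and config file.
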
# ==============================================
# MAIN FUNCTION
# ==============================================
def main():
    parser = argparse.ArgumentParser(description="Validate OtterTune's source code")
    parser.add_argument('paths', metavar='PATH', type=str, nargs='*',
                        help='Files or directories to (recursively) validate')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Enable verbose output')
    parser.add_argument('--staged-files', action='store_true',
                        help='Apply the selected action(s) to all staged files (git)')
    parser.add_argument('--unstaged-files', action='store_true',
                        help='Apply the selected action(s) to all unstaged tracked files (git)')
    parser.add_argument('--untracked-files', action='store_true',
                        help='Apply the selected action(s) to all untracked files (git)')
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.DEBUG)

    LOG.info('\nRunning source validators:\n%s\n',
             '\n'.join(' ' + v.name for v in VALIDATORS))
    for validator in VALIDATORS:
        if not validate_validator(validator.modules, validator.config_path):
            sys.exit(EXIT_FAILURE)

    targets = []
    if args.paths or args.staged_files or args.unstaged_files or args.untracked_files:
        if args.paths:
            targets += args.paths
        if args.staged_files:
            targets += get_git_files('staged')
        if args.unstaged_files:
            targets += get_git_files('unstaged')
        if args.untracked_files:
            targets += get_git_files('untracked')
        if not targets:
            LOG.error("No files/directories found. Exiting...")
            sys.exit(EXIT_FAILURE)
    else:
        targets = DEFAULT_DIRS

    targets = sorted(os.path.abspath(t) for t in targets)
    LOG.info('\nFiles/directories to validate:\n%s\n',
             '\n'.join(' ' + t for t in targets))

    status = True
    for target in targets:
        if os.path.isfile(target):
            LOG.debug("Scanning file: %s\n", target)
            target_status = validate_file(target)
        elif os.path.isdir(target):
            LOG.debug("Scanning directory: %s\n", target)
            target_status = validate_dir(target)
        else:
            LOG.error("%s isn't a file or directory", target)
            sys.exit(EXIT_FAILURE)
        if not target_status:
            status = False

    if not status:
        LOG.info(SEPARATOR + '\n')
        LOG.info("Validation NOT successful\n")
        sys.exit(EXIT_FAILURE)

    LOG.info("Validation successful\n")
    sys.exit(EXIT_SUCCESS)


if __name__ == '__main__':
    main()