#!/usr/bin/env python
# flawfinder: Find potential security flaws ("hits") in source code.
# Usage:
#   flawfinder [options] [source_code_file]+
#
# See the man page for a description of the options.
#
# The output is as follows:
#   filename:line_number:column_number [risk_level] (type) function_name: message
#   where "risk_level" goes from 0 to 5.  0=no risk, 5=maximum risk.
# The final output is sorted by risk level, most risky first.
#
# Currently this program can only analyze C/C++ code.
#
# Copyright (C) 2001 David A. Wheeler
# This is released under the General Public License (GPL):
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import sys, re, string, getopt, pickle

# Program Options - these are the default values:
show_context = 0
minimum_level = 1
show_immediately = 0
show_inputs = 0        # Only show inputs?
show_columns = 0
loadhitlist = None
savehitlist = None
diffhitlist = None

def error(message):
  sys.stderr.write("Error: %s\n" % message)

# Support routines: find a pattern.
# To simplify the calling convention, several global variables are used
# and these support routines are defined, in an attempt to make the
# actual calls simpler and clearer.
#
filename = ""     # Source filename.
linenumber = 0    # Linenumber from original file.
ignoreline = -1   # Line number to ignore.

line_beginning = re.compile( r'(?m)^' )
blank_line     = re.compile( r'(?m)^\s+$' )

class Hit:
  """
  Each instance of Hit is a warning of some kind in a source code file.
  See the rulesets, which define the conditions for triggering a hit.
  Hit is initialized with a tuple containing the following:
    hook:       function to call when function name found.
    level:      (default) warning level, 0-5.  0=no problem, 5=very risky.
    warning:    warning (text saying what's the problem)
    suggestion: suggestion (text suggesting what to do instead)
    category:   One of "buffer" (buffer overflow), "race" (race condition),
                "tmpfile" (temporary file creation), "format" (format string).
                Use "" if you don't have a better category.
    url:        URL fragment reference.
    other:      A dictionary with other settings.

  Other settings usually set:
    name:         function name
    parameter:    the function parameters (0th parameter null)
    input:        set to 1 if the function inputs from external sources.
    start:        start position (index) of the function name (in text)
    end:          end position of the function name (in text)
    filename:     name of file
    line:         line number in file
    column:       column in line in file
    context_text: text surrounding hit
  """

  # Set default values:
  source_position = 2   # By default, the second parameter is the source.
  format_position = 1   # By default, the first parameter is the format.
  input = 0             # By default, this doesn't read input.
  note = ""             # No additional notes.
  filename = ""         # Empty string is filename.
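  # For illustration (drawn from the c_ruleset table defined further below),
  # the entry for "gets" supplies a data tuple like:
  #   (normal, 5, "does not check for buffer overflows",
  #    "Use fgets() instead", "buffer", "", {'input' : 1})
  # so Hit(data) yields a hit with level 5 and input set to 1; fields such as
  # name, filename, line, and column are filled in separately, as described
  # in the docstring above.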

  def __init__(self, data):
    hook, level, warning, suggestion, category, url, other = data
    self.hook, self.level = hook, level
    self.warning, self.suggestion = warning, suggestion
    self.category, self.url = category, url
    for key in other.keys():
      setattr(self, key, other[key])

  def __cmp__(self, other):
    return (cmp(other.level, self.level) or      # Highest risk first.
            cmp(self.filename, other.filename) or
            cmp(self.line, other.line) or
            cmp(self.column, other.column) or
            cmp(self.name, other.name))

  def __getitem__(self, X):   # Define this so this works: "%(line)" % hit
    return getattr(self, X)

  def show(self):
    sys.stdout.write("%s:" % self.filename)
    if show_columns:
      print ("%(line)s:%(column)s [%(level)s] (%(category)s) "
             "%(name)s: %(warning)s. %(suggestion)s. %(note)s" % self)
    else:
      print ("%(line)s [%(level)s] (%(category)s) "
             "%(name)s: %(warning)s. %(suggestion)s. %(note)s" % self)
    if show_context:
      print self.context_text

# The "hitlist" is the list of all hits (warnings) found so far.
# Use add_warning to add to it.

hitlist = []

def add_warning(hit):
  global hitlist
  if show_inputs and not hit.input:
    return
  if hit.level >= minimum_level and linenumber != ignoreline:
    hitlist.append(hit)
    if show_immediately:
      hit.show()

# C Language Specific

def extract_c_parameters(text, pos=0):
  "Return a list of the given C function's parameters, starting at text[pos]"
  # '(a,b)' produces ['', 'a', 'b']
  i = pos
  # Skip whitespace and find the "("; if there isn't one, return []:
  while i < len(text):
    if text[i] == '(':
      break
    elif text[i] in string.whitespace:
      i = i + 1
    else:
      return []
  else:    # Never found a reasonable ending.
    return []
  i = i + 1
  parameters = [""]   # Insert 0th entry, so 1st parameter is parameter[1].
  currentstart = i
  parenlevel = 1
  instring = 0
  incomment = 0
  while i < len(text):
    c = text[i]
    if instring:
      if c == '"':
        instring = 0
      elif c == '\\' and text[i:i+2] == '\\"':
        i = i + 1
    elif incomment:
      if c == '*' and text[i:i+2] == '*/':
        incomment = 0
        i = i + 1
    else:
      if c == '"':
        instring = 1
      elif c == '/' and text[i:i+2] == '/*':
        incomment = 1
        i = i + 1
      elif c == '/' and text[i:i+2] == '//':
        while i < len(text) and text[i] != "\n":
          i = i + 1
      elif c == '\\' and text[i:i+2] == '\\"':
        i = i + 1   # Handle '\"'
      elif c == '(':
        parenlevel = parenlevel + 1
      elif c == ',' and (parenlevel == 1):
        parameters.append(string.strip(text[currentstart:i]))
        currentstart = i + 1
      elif c == ')':
        parenlevel = parenlevel - 1
        if parenlevel <= 0:
          parameters.append(string.strip(text[currentstart:i]))
          return parameters
    i = i + 1
  # Never found the closing ')' - report it as an internal parsing problem.
  error("Parsing failed to find end of parameter list")

def strip_surrounding_function(text, functionname):
  """If a call to function functionname surrounds text, remove the call;
  otherwise return text."""
  # Presumes functionname has no regular expression characters in it.
  match = re.search(r'^\s*' + functionname + r'\s*\((.*)\)\s*$', text)
  if match:
    return string.strip(match.group(1))
  else:
    return text

def strip_i18n(text):
  """Strip any internationalization function calls surrounding 'text',
  such as gettext() and _()."""
  text = strip_surrounding_function(text, 'gettext')
  text = strip_surrounding_function(text, '_')
  return text

p_c_singleton_string = re.compile( r'^\s*"([^\\]|\\[^0-6]|\\[0-6]+)?"\s*$')

def c_singleton_string(text):
  "Returns true if text is a C string with 0 or 1 character."
  if p_c_singleton_string.search(text):
    return 1
  else:
    return 0

p_c_constant_string = re.compile( r'^\s*"([^\\]|\\[^0-6]|\\[0-6]+)*"\s*$')

def c_constant_string(text):
  "Returns true if text is a constant C string."
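  # A brief illustration (example inputs chosen for this comment only):
  #   c_constant_string(' "hello, world\\n" ')  returns 1  (string literal)
  #   c_constant_string('buf')                  returns 0  (an identifier)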
  if p_c_constant_string.search(text):
    return 1
  else:
    return 0

# Precompile patterns for speed.

# Patterns indicating the beginning or ending of a chunk.
# I'm not using chunks right now, this may be removed in the future.
c_begin_chunk = re.compile( r'^\s*({|}|if|else|case|switch|for)\b' )
c_end_chunk   = re.compile( r'[{};]\s*$' )

# Currently this is unused:
def setdefault(dict, key, value):
  "Set dict[key]=value if it's not set, then return dict[key]."
  try:
    return dict[key]
  except KeyError:
    dict[key] = value
    return value

def c_buffer(hit):
  source_position = hit.source_position
  if source_position <= len(hit.parameters)-1:
    source = hit.parameters[source_position]
    if c_singleton_string(source):
      hit.level = 1
      hit.note = "Risk is low because the source is a constant character."
    elif c_constant_string(strip_i18n(source)):
      hit.level = max( hit.level - 2, 1)
      hit.note = "Risk is low because the source is a constant string."
  add_warning(hit)

def c_printf(hit):
  format_position = hit.format_position
  if format_position <= len(hit.parameters)-1:
    # Assume that translators are trusted to not insert "evil" formats:
    source = strip_i18n(hit.parameters[format_position])
    if c_constant_string(source):
      # Parameter is constant, so there's no risk of format string problems.
      if hit.name == "snprintf" or hit.name == "vsnprintf":
        hit.level = 1
        hit.warning = \
          "On some very old systems, snprintf is incorrectly implemented " \
          "and permits buffer overflows; there are also incompatible " \
          "standard definitions of it"
        hit.suggestion = "Check it during installation, or use something else"
        hit.category = "port"
      else:
        # We'll pass it on, just in case it's needed, but at level 0 risk.
        hit.level = 0
        hit.note = "Constant format string, so not considered risky."
  add_warning(hit)

p_dangerous_sprintf_format = re.compile(r'%-?([0-9]+|\*)?s')

# sprintf has both buffer and format vulnerabilities.
def c_sprintf(hit):
  source_position = hit.source_position
  if source_position <= len(hit.parameters)-1:
    source = hit.parameters[source_position]
    if c_singleton_string(source):
      hit.level = 1
      hit.note = "Risk is low because the source is a constant character."
    else:
      source = strip_i18n(source)
      if c_constant_string(source):
        if not p_dangerous_sprintf_format.search(source):
          hit.level = max( hit.level - 2, 1)
          hit.note = "Risk is low because the source has a constant maximum length."
        # otherwise, warn of potential buffer overflow (the default)
      else:
        # Ho ho - a nonconstant format string - we have a different problem.
        hit.warning = "Potential format string problem"
        hit.suggestion = "Make Format string constant"
        hit.level = 4
        hit.category = "format"
        hit.url = ""
  add_warning(hit)

p_dangerous_scanf_format = re.compile(r'%s')
p_low_risk_scanf_format  = re.compile(r'%[0-9]+s')

def c_scanf(hit):
  format_position = hit.format_position
  if format_position <= len(hit.parameters)-1:
    # Assume that translators are trusted to not insert "evil" formats;
    # it's not clear that translators will be messing with INPUT formats,
    # but it's possible so we'll account for it.
    source = strip_i18n(hit.parameters[format_position])
    if c_constant_string(source):
      if p_dangerous_scanf_format.search(source):
        pass   # Accept default.
      elif p_low_risk_scanf_format.search(source):
        hit.level = 1
        hit.note = "Only low-risk scanf formats detected."
      else:
        # No risky scanf request.
        # We'll pass it on, just in case it's needed, but at level 0 risk.
        hit.level = 0
        hit.note = "No risky scanf format detected."
    else:
      # Format isn't a constant.
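      # (Illustrative case: a call such as scanf(fmt, &value), where fmt is
      # a variable rather than a string literal, ends up in this branch.)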
hit.note = "If the scanf format is influenceable by an attacker, it's exploitable." add_warning(hit) def normal(hit): add_warning(hit) # "c_ruleset": the rules for identifying "hits" in C (potential warnings). # It's a dictionary, where the key is the function name causing the hit, # and the value is a tuple with the following format: # (hook, level, warning, suggestion, category, {other}) # See the definition for class "Hit". # The key can have multiple values separated with "|". c_ruleset = { "strcpy" : (c_buffer, 4, "does not check for buffer overflows", "Consider using strncpy or strlcpy", "buffer", "", {}), "strcat" : (c_buffer, 4, "does not check for buffer overflows", "Consider using strncat or strlcat", "buffer", "", {}), "gets": (normal, 5, "does not check for buffer overflows", "Use fgets() instead", "buffer", "", {'input' : 1}), # The "sprintf" hook will raise "format" issues instead if appropriate: "sprintf|vsprintf": (c_sprintf, 4, "does not check for buffer overflows", "Use snprintf or vsnprintf", "buffer", "", {}), "printf|vprintf": (c_printf, 4, "if format strings can be influenced by an attacker, they can be " "exploited", "Use a constant for the format specification", "format", "", {}), "fprintf|vfprintf": (c_printf, 4, "if format strings can be influenced by an attacker, they can be " "exploited", "Use a constant for the format specification", "format", "", { 'format_position' : 2}), "snprintf|vsnprintf": (c_printf, 4, "if format strings can be influenced by an attacker, they can be " "exploited", "Use a constant for the format specification", "format", "", { 'format_position' : 3}), "scanf|vscanf": (c_scanf, 4, "the scanf() family's %s operation, without a limit specification, " "permits buffer overflows", "Specify a limit to %s, or use a different input function", "buffer", "", {'input' : 1}), "fscanf|sscanf|vsscanf|vfscanf": (c_scanf, 4, "the scanf() family's %s operation, without a limit specification, " "permits buffer overflows", "Specify a limit to %s, or use a different input function", "buffer", "", {'input' : 1, 'format_position' : 2}), "realpath|getopt|getpass|streadd|strecpy|strtrns|getwd": (normal, 3, "this does not protect against buffer overflows " "by itself, so use with caution", "", "buffer", "dangers-c", {}), "access": # ???: TODO: analyze TOCTOU more carefully. (normal, 4, "this usually indicates a security flaw. If an " "attacker can change anything along the path between the " "call to access() and the file's actual use (e.g., by moving " "files), the attacker can exploit the race condition", "Set up the correct permissions (e.g., using setuid()) and " "try to open the file directly", "race", "avoid-race#atomic-filesystem", {}), "chown": (normal, 5, "this accepts filename arguments; if an attacker " "can move those files, a race condition results. ", "Use fchown( ) instead", "race", "", {}), "chgrp": (normal, 5, "this accepts filename arguments; if an attacker " "can move those files, a race condition results. ", "Use fchgrp( ) instead", "race", "", {}), "chmod": (normal, 5, "this accepts filename arguments; if an attacker " "can move those files, a race condition results. 
", "Use fchmod( ) instead", "race", "", {}), "vfork": (normal, 2, "on some old systems, vfork() permits race conditions, and it's " "very difficult to use correctly", "Use fork() instead", "race", "", {}), "tmpfile": (normal, 2, "tmpfile() has a security flaw on some systems (e.g., older " "System V systems)", "", "tmpfile", "", {}), "tmpnam|tempnam": (normal, 3, "temporary file race condition", "", "tmpfile", "avoid-race", {}), # TODO: Detect GNOME approach to mktemp and ignore it. "mktemp": (normal, 4, "temporary file race condition", "", "tmpfile", "avoid-race", {}), # TODO: Need to detect varying levels of danger. "execl|execlp|execle|execv|execvp|system|popen": (normal, 4, "this calls out to a new process and is difficult to use safely", "try using a library call that implements the same functionality " "if available.", "tmpfile", "", {}), # TODO: There are many more rules to add, esp. for TOCTOU. } template_ruleset = { # This is a template for adding new entries (the key is impossible): "9": (normal, 2, "", "", "tmpfile", "", {}), } def find_column(text, position): "Find column number inside line." newline = string.rfind(text, "\n", 0, position) if newline == -1: return position + 1 else: return position - newline def get_context(text, position): "Get surrounding text line starting from text[position]" linestart = string.rfind(text, "\n", 0, position+1) + 1 lineend = string.find(text, "\n", position, len(text)) if lineend == -1: lineend = len(text) return text[linestart:lineend] def process_directive(): "Given a directive, process it." # TODO: Currently this is just a stub routine that simply removes # hits from the current line, if any, and sets a flag if not. hitfound = 0 # Iterate backwards over hits, to be careful about the destructive iterator for i in xrange(len(hitlist)-1, -1, -1): if hitlist[i].line == linenumber: del hitlist[i] # DESTROY - this is a DESTRUCTIVE iterator. hitfound = 1 # Don't break, because there may be more than one. if not hitfound: ignoreline = linenumber + 1 # Nothing found - ignore next line. # Characters that can be in a string. # 0x4, 4.4e4, etc. numberset=string.hexdigits+"_x.Ee" # Patterns for various circumstances: p_include = re.compile( r'#\s*include\s+(<.*?>|".*?")' ) p_digits = re.compile( r'[0-9]' ) p_alphaunder = re.compile( r'[A-Za-z_]' ) # Alpha chars and underline. # A "word" in C. Note that "$" is permitted -- it's not permitted by the # C standard in identifiers, but gcc supports it as an extension. p_c_word = re.compile( r'[A-Za-z_][A-Za-z_0-9$]*' ) # We'll recognize ITS4 directives, as well as our own, # for compatibility's sake: p_directive = re.compile( r'(?i)\s*(ITS4|Flawfinder):\s*([^\*]*)' ) def process_c_file(f): global filename, linenumber, ignoreline filename=f linenumber = 1 ignoreline = -1 incomment = 0 instring = 0 linebegin = 1 if f == "-": input = sys.stdin else: input = open(f, "r") # Read ENTIRE file into memory. Use readlines() to convert \n if necessary. # This turns out to be very fast in Python, even on large files, and it # eliminates lots of range checking later, making the result faster. text = string.join(input.readlines(),"") i = 0 while i < len(text): # This is a trivial tokenizer that just tries to find "words", which # match [A-Za-z_][A-Za-z0-9_]*. It skips comments & strings. # It also skips "#include <...>", which must be handled specially # because "<" and ">" aren't usually delimiters. # It doesn't bother to tokenize anything else, since it's not used. 
    # The following is a state machine with 3 states: incomment, instring,
    # and "normal", and a separate state "linebegin" if at BOL.
    if linebegin:   # If at beginning of line, see if #include is there.
      linebegin = 0
      m = p_include.match(text, i)
      if m:   # Found #include, skip it.
        i = m.end(0)
        continue
    c = text[i]
    if c == "\n":
      linenumber = linenumber + 1
      linebegin = 1
    i = i + 1
    # From here on, text[i] points to next character.
    if c == " ": continue   # Skip whitespace.
    if i < len(text):
      nextc = text[i]
    else:
      nextc = ''
    if incomment:
      if c == '*' and nextc == '/':
        i = i + 1
        incomment = 0
    elif instring:
      if c == '\\' and nextc == '"':
        i = i + 1
      elif c == '"':
        instring = 0
    else:
      if c == '/' and nextc == '*':
        m = p_directive.match(text, i)   # Is there a directive here?
        if m:
          process_directive()
        i = i + 1
        incomment = 1
      elif c == '/' and nextc == '/':   # "//" comments - skip to EOL.
        m = p_directive.match(text, i)   # Is there a directive here?
        if m:
          process_directive()
        while i