#!/usr/bin/env python
# flawfinder: Find potential security flaws ("hits") in source code.
# Usage:
#   flawfinder [options] [source_code_file]+
#
# See the man page for a description of the options.
#
# The output is as follows:
#   filename:line_number:column_number [risk_level] (type) function_name: message
#   where "risk_level" goes from 0 to 5.  0=no risk, 5=maximum risk.
# The final output is sorted by risk level, most risky first.
#
# Currently this program can only analyze C/C++ code.
#
# Copyright (C) 2001 David A. Wheeler
# This is released under the General Public License (GPL):
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import sys, re, string, getopt, pickle

# Program Options - these are the default values:
show_context = 0
minimum_level = 1
show_immediately = 0
show_inputs = 0        # Only show inputs?
show_columns = 0
loadhitlist = None
savehitlist = None
diffhitlist = None

def error(message):
  sys.stderr.write("Error: %s\n" % message)

# Support routines: find a pattern.
# To simplify the calling convention, several global variables are used
# and these support routines are defined, in an attempt to make the
# actual calls simpler and clearer.
#
filename = ""     # Source filename.
linenumber = 0    # Linenumber from original file.
ignoreline = -1   # Line number to ignore.

line_beginning = re.compile( r'(?m)^' )
blank_line     = re.compile( r'(?m)^\s+$' )

class Hit:
  """
  Each instance of Hit is a warning of some kind in a source code file.
  See the rulesets, which define the conditions for triggering a hit.
  Hit is initialized with a tuple containing the following:
    hook:       function to call when function name found.
    level:      (default) warning level, 0-5.  0=no problem, 5=very risky.
    warning:    warning (text saying what's the problem)
    suggestion: suggestion (text suggesting what to do instead)
    category:   One of "buffer" (buffer overflow), "race" (race condition),
                "tmpfile" (temporary file creation), "format" (format string).
                Use "" if you don't have a better category.
    url:        URL fragment reference.
    other:      A dictionary with other settings.

  Other settings usually set:
    name:         function name
    parameter:    the function parameters (0th parameter null)
    input:        set to 1 if the function inputs from external sources.
    start:        start position (index) of the function name (in text)
    end:          end position of the function name (in text)
    filename:     name of file
    line:         line number in file
    column:       column in line in file
    context_text: text surrounding hit
  """

  # Set default values:
  source_position = 2   # By default, the second parameter is the source.
  format_position = 1   # By default, the first parameter is the format.
  input = 0             # By default, this doesn't read input.
  note = ""             # No additional notes.
  filename = ""         # Empty string is filename.
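  # For illustration (drawn from the c_ruleset table defined further below),
  # the entry for "gets" supplies a data tuple like:
  #   (normal, 5, "does not check for buffer overflows",
  #    "Use fgets() instead", "buffer", "", {'input' : 1})
  # so Hit(data) yields a hit with level 5 and input set to 1; fields such as
  # name, filename, line, and column are filled in separately, as described
  # in the docstring above.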

  def __init__(self, data):
    hook, level, warning, suggestion, category, url, other = data
    self.hook, self.level = hook, level
    self.warning, self.suggestion = warning, suggestion
    self.category, self.url = category, url
    for key in other.keys():
      setattr(self, key, other[key])

  def __cmp__(self, other):
    return (cmp(other.level, self.level) or      # Highest risk first.
            cmp(self.filename, other.filename) or
            cmp(self.line, other.line) or
            cmp(self.column, other.column) or
            cmp(self.name, other.name))

  def __getitem__(self, X):   # Define this so this works: "%(line)" % hit
    return getattr(self, X)

  def show(self):
    sys.stdout.write("%s:" % self.filename)
    if show_columns:
      print ("%(line)s:%(column)s [%(level)s] (%(category)s) "
             "%(name)s: %(warning)s. %(suggestion)s. %(note)s" % self)
    else:
      print ("%(line)s [%(level)s] (%(category)s) "
             "%(name)s: %(warning)s. %(suggestion)s. %(note)s" % self)
    if show_context:
      print self.context_text

# The "hitlist" is the list of all hits (warnings) found so far.
# Use add_warning to add to it.

hitlist = []

def add_warning(hit):
  global hitlist
  if show_inputs and not hit.input:
    return
  if hit.level >= minimum_level and linenumber != ignoreline:
    hitlist.append(hit)
    if show_immediately:
      hit.show()

# C Language Specific

def extract_c_parameters(text, pos=0):
  "Return a list of the given C function's parameters, starting at text[pos]"
  # '(a,b)' produces ['', 'a', 'b']
  i = pos
  # Skip whitespace and find the "("; if there isn't one, return []:
  while i < len(text):
    if text[i] == '(':
      break
    elif text[i] in string.whitespace:
      i = i + 1
    else:
      return []
  else:    # Never found a reasonable ending.
    return []
  i = i + 1
  parameters = [""]   # Insert 0th entry, so 1st parameter is parameter[1].
  currentstart = i
  parenlevel = 1
  instring = 0
  incomment = 0
  while i < len(text):
    c = text[i]
    if instring:
      if c == '"':
        instring = 0
      elif c == '\\' and text[i:i+2] == '\\"':
        i = i + 1
    elif incomment:
      if c == '*' and text[i:i+2] == '*/':
        incomment = 0
        i = i + 1
    else:
      if c == '"':
        instring = 1
      elif c == '/' and text[i:i+2] == '/*':
        incomment = 1
        i = i + 1
      elif c == '/' and text[i:i+2] == '//':
        while i < len(text) and text[i] != "\n":
          i = i + 1
      elif c == '\\' and text[i:i+2] == '\\"':
        i = i + 1   # Handle '\"'
      elif c == '(':
        parenlevel = parenlevel + 1
      elif c == ',' and (parenlevel == 1):
        parameters.append(string.strip(text[currentstart:i]))
        currentstart = i + 1
      elif c == ')':
        parenlevel = parenlevel - 1
        if parenlevel <= 0:
          parameters.append(string.strip(text[currentstart:i]))
          return parameters
    i = i + 1
  # Never found the closing ')' - report it as an internal parsing problem.
  error("Parsing failed to find end of parameter list")

def strip_surrounding_function(text, functionname):
  """If a call to function functionname surrounds text, remove the call;
  otherwise return text."""
  # Presumes functionname has no regular expression characters in it.
  match = re.search(r'^\s*' + functionname + r'\s*\((.*)\)\s*$', text)
  if match:
    return string.strip(match.group(1))
  else:
    return text

def strip_i18n(text):
  """Strip any internationalization function calls surrounding 'text',
  such as gettext() and _()."""
  text = strip_surrounding_function(text, 'gettext')
  text = strip_surrounding_function(text, '_')
  return text

p_c_singleton_string = re.compile( r'^\s*"([^\\]|\\[^0-6]|\\[0-6]+)?"\s*$')

def c_singleton_string(text):
  "Returns true if text is a C string with 0 or 1 character."
  if p_c_singleton_string.search(text):
    return 1
  else:
    return 0

p_c_constant_string = re.compile( r'^\s*"([^\\]|\\[^0-6]|\\[0-6]+)*"\s*$')

def c_constant_string(text):
  "Returns true if text is a constant C string."
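  # A brief illustration (example inputs chosen for this comment only):
  #   c_constant_string(' "hello, world\\n" ')  returns 1  (string literal)
  #   c_constant_string('buf')                  returns 0  (an identifier)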
  if p_c_constant_string.search(text):
    return 1
  else:
    return 0

# Precompile patterns for speed.

# Patterns indicating the beginning or ending of a chunk.
# I'm not using chunks right now, this may be removed in the future.
c_begin_chunk = re.compile( r'^\s*({|}|if|else|case|switch|for)\b' )
c_end_chunk   = re.compile( r'[{};]\s*$' )

# Currently this is unused:
def setdefault(dict, key, value):
  "Set dict[key]=value if it's not set, then return dict[key]."
  try:
    return dict[key]
  except KeyError:
    dict[key] = value
    return value

def c_buffer(hit):
  source_position = hit.source_position
  if source_position <= len(hit.parameters)-1:
    source = hit.parameters[source_position]
    if c_singleton_string(source):
      hit.level = 1
      hit.note = "Risk is low because the source is a constant character."
    elif c_constant_string(strip_i18n(source)):
      hit.level = max( hit.level - 2, 1)
      hit.note = "Risk is low because the source is a constant string."
  add_warning(hit)

def c_printf(hit):
  format_position = hit.format_position
  if format_position <= len(hit.parameters)-1:
    # Assume that translators are trusted to not insert "evil" formats:
    source = strip_i18n(hit.parameters[format_position])
    if c_constant_string(source):
      # Parameter is constant, so there's no risk of format string problems.
      if hit.name == "snprintf" or hit.name == "vsnprintf":
        hit.level = 1
        hit.warning = \
          "On some very old systems, snprintf is incorrectly implemented " \
          "and permits buffer overflows; there are also incompatible " \
          "standard definitions of it"
        hit.suggestion = "Check it during installation, or use something else"
        hit.category = "port"
      else:
        # We'll pass it on, just in case it's needed, but at level 0 risk.
        hit.level = 0
        hit.note = "Constant format string, so not considered risky."
  add_warning(hit)

p_dangerous_sprintf_format = re.compile(r'%-?([0-9]+|\*)?s')

# sprintf has both buffer and format vulnerabilities.
def c_sprintf(hit):
  source_position = hit.source_position
  if source_position <= len(hit.parameters)-1:
    source = hit.parameters[source_position]
    if c_singleton_string(source):
      hit.level = 1
      hit.note = "Risk is low because the source is a constant character."
    else:
      source = strip_i18n(source)
      if c_constant_string(source):
        if not p_dangerous_sprintf_format.search(source):
          hit.level = max( hit.level - 2, 1)
          hit.note = "Risk is low because the source has a constant maximum length."
        # otherwise, warn of potential buffer overflow (the default)
      else:
        # Ho ho - a nonconstant format string - we have a different problem.
        hit.warning = "Potential format string problem"
        hit.suggestion = "Make Format string constant"
        hit.level = 4
        hit.category = "format"
        hit.url = ""
  add_warning(hit)

p_dangerous_scanf_format = re.compile(r'%s')
p_low_risk_scanf_format  = re.compile(r'%[0-9]+s')

def c_scanf(hit):
  format_position = hit.format_position
  if format_position <= len(hit.parameters)-1:
    # Assume that translators are trusted to not insert "evil" formats;
    # it's not clear that translators will be messing with INPUT formats,
    # but it's possible so we'll account for it.
    source = strip_i18n(hit.parameters[format_position])
    if c_constant_string(source):
      if p_dangerous_scanf_format.search(source):
        pass   # Accept default.
      elif p_low_risk_scanf_format.search(source):
        hit.level = 1
        hit.note = "Only low-risk scanf formats detected."
      else:
        # No risky scanf request.
        # We'll pass it on, just in case it's needed, but at level 0 risk.
        hit.level = 0
        hit.note = "No risky scanf format detected."
    else:
      # Format isn't a constant.
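      # (Illustrative case: a call such as scanf(fmt, &value), where fmt is
      # a variable rather than a string literal, ends up in this branch.)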
hit.note = "If the scanf format is influenceable by an attacker, it's exploitable." add_warning(hit) def normal(hit): add_warning(hit) # "c_ruleset": the rules for identifying "hits" in C (potential warnings). # It's a dictionary, where the key is the function name causing the hit, # and the value is a tuple with the following format: # (hook, level, warning, suggestion, category, {other}) # See the definition for class "Hit". # The key can have multiple values separated with "|". c_ruleset = { "strcpy" : (c_buffer, 4, "does not check for buffer overflows", "Consider using strncpy or strlcpy", "buffer", "", {}), "strcat" : (c_buffer, 4, "does not check for buffer overflows", "Consider using strncat or strlcat", "buffer", "", {}), "gets": (normal, 5, "does not check for buffer overflows", "Use fgets() instead", "buffer", "", {'input' : 1}), # The "sprintf" hook will raise "format" issues instead if appropriate: "sprintf|vsprintf": (c_sprintf, 4, "does not check for buffer overflows", "Use snprintf or vsnprintf", "buffer", "", {}), "printf|vprintf": (c_printf, 4, "if format strings can be influenced by an attacker, they can be " "exploited", "Use a constant for the format specification", "format", "", {}), "fprintf|vfprintf": (c_printf, 4, "if format strings can be influenced by an attacker, they can be " "exploited", "Use a constant for the format specification", "format", "", { 'format_position' : 2}), "snprintf|vsnprintf": (c_printf, 4, "if format strings can be influenced by an attacker, they can be " "exploited", "Use a constant for the format specification", "format", "", { 'format_position' : 3}), "scanf|vscanf": (c_scanf, 4, "the scanf() family's %s operation, without a limit specification, " "permits buffer overflows", "Specify a limit to %s, or use a different input function", "buffer", "", {'input' : 1}), "fscanf|sscanf|vsscanf|vfscanf": (c_scanf, 4, "the scanf() family's %s operation, without a limit specification, " "permits buffer overflows", "Specify a limit to %s, or use a different input function", "buffer", "", {'input' : 1, 'format_position' : 2}), "realpath|getopt|getpass|streadd|strecpy|strtrns|getwd": (normal, 3, "this does not protect against buffer overflows " "by itself, so use with caution", "", "buffer", "dangers-c", {}), "access": # ???: TODO: analyze TOCTOU more carefully. (normal, 4, "this usually indicates a security flaw. If an " "attacker can change anything along the path between the " "call to access() and the file's actual use (e.g., by moving " "files), the attacker can exploit the race condition", "Set up the correct permissions (e.g., using setuid()) and " "try to open the file directly", "race", "avoid-race#atomic-filesystem", {}), "chown": (normal, 5, "this accepts filename arguments; if an attacker " "can move those files, a race condition results. ", "Use fchown( ) instead", "race", "", {}), "chgrp": (normal, 5, "this accepts filename arguments; if an attacker " "can move those files, a race condition results. ", "Use fchgrp( ) instead", "race", "", {}), "chmod": (normal, 5, "this accepts filename arguments; if an attacker " "can move those files, a race condition results. 
", "Use fchmod( ) instead", "race", "", {}), "vfork": (normal, 2, "on some old systems, vfork() permits race conditions, and it's " "very difficult to use correctly", "Use fork() instead", "race", "", {}), "tmpfile": (normal, 2, "tmpfile() has a security flaw on some systems (e.g., older " "System V systems)", "", "tmpfile", "", {}), "tmpnam|tempnam": (normal, 3, "temporary file race condition", "", "tmpfile", "avoid-race", {}), # TODO: Detect GNOME approach to mktemp and ignore it. "mktemp": (normal, 4, "temporary file race condition", "", "tmpfile", "avoid-race", {}), # TODO: Need to detect varying levels of danger. "execl|execlp|execle|execv|execvp|system|popen": (normal, 4, "this calls out to a new process and is difficult to use safely", "try using a library call that implements the same functionality " "if available.", "tmpfile", "", {}), # TODO: There are many more rules to add, esp. for TOCTOU. } template_ruleset = { # This is a template for adding new entries (the key is impossible): "9": (normal, 2, "", "", "tmpfile", "", {}), } def find_column(text, position): "Find column number inside line." newline = string.rfind(text, "\n", 0, position) if newline == -1: return position + 1 else: return position - newline def get_context(text, position): "Get surrounding text line starting from text[position]" linestart = string.rfind(text, "\n", 0, position+1) + 1 lineend = string.find(text, "\n", position, len(text)) if lineend == -1: lineend = len(text) return text[linestart:lineend] def process_directive(): "Given a directive, process it." # TODO: Currently this is just a stub routine that simply removes # hits from the current line, if any, and sets a flag if not. hitfound = 0 # Iterate backwards over hits, to be careful about the destructive iterator for i in xrange(len(hitlist)-1, -1, -1): if hitlist[i].line == linenumber: del hitlist[i] # DESTROY - this is a DESTRUCTIVE iterator. hitfound = 1 # Don't break, because there may be more than one. if not hitfound: ignoreline = linenumber + 1 # Nothing found - ignore next line. # Characters that can be in a string. # 0x4, 4.4e4, etc. numberset=string.hexdigits+"_x.Ee" # Patterns for various circumstances: p_include = re.compile( r'#\s*include\s+(<.*?>|".*?")' ) p_digits = re.compile( r'[0-9]' ) p_alphaunder = re.compile( r'[A-Za-z_]' ) # Alpha chars and underline. # A "word" in C. Note that "$" is permitted -- it's not permitted by the # C standard in identifiers, but gcc supports it as an extension. p_c_word = re.compile( r'[A-Za-z_][A-Za-z_0-9$]*' ) # We'll recognize ITS4 directives, as well as our own, # for compatibility's sake: p_directive = re.compile( r'(?i)\s*(ITS4|Flawfinder):\s*([^\*]*)' ) def process_c_file(f): global filename, linenumber, ignoreline filename=f linenumber = 1 ignoreline = -1 incomment = 0 instring = 0 linebegin = 1 if f == "-": input = sys.stdin else: input = open(f, "r") # Read ENTIRE file into memory. Use readlines() to convert \n if necessary. # This turns out to be very fast in Python, even on large files, and it # eliminates lots of range checking later, making the result faster. text = string.join(input.readlines(),"") i = 0 while i < len(text): # This is a trivial tokenizer that just tries to find "words", which # match [A-Za-z_][A-Za-z0-9_]*. It skips comments & strings. # It also skips "#include <...>", which must be handled specially # because "<" and ">" aren't usually delimiters. # It doesn't bother to tokenize anything else, since it's not used. 
    # The following is a state machine with 3 states: incomment, instring,
    # and "normal", and a separate state "linebegin" if at BOL.
    if linebegin:   # If at beginning of line, see if #include is there.
      linebegin = 0
      m = p_include.match(text, i)
      if m:   # Found #include, skip it.
        i = m.end(0)
        continue
    c = text[i]
    if c == "\n":
      linenumber = linenumber + 1
      linebegin = 1
    i = i + 1
    # From here on, text[i] points to next character.
    if c == " ": continue   # Skip whitespace.
    if i < len(text):
      nextc = text[i]
    else:
      nextc = ''
    if incomment:
      if c == '*' and nextc == '/':
        i = i + 1
        incomment = 0
    elif instring:
      if c == '\\' and nextc == '"':
        i = i + 1
      elif c == '"':
        instring = 0
    else:
      if c == '/' and nextc == '*':
        m = p_directive.match(text, i)   # Is there a directive here?
        if m:
          process_directive()
        i = i + 1
        incomment = 1
      elif c == '/' and nextc == '/':   # "//" comments - skip to EOL.
        m = p_directive.match(text, i)   # Is there a directive here?
        if m:
          process_directive()
        while i