[Findit] Plain objects to represent and parse stack trace.

NOTRY=true Review URL: https://codereview.chromium.org/430943003 Cr-Commit-Position: refs/heads/master@{#288270} git-svn-id: svn://svn.chromium.org/chrome/trunk/src@288270 0039d316-1c4b-4281-b951-d872f2087c98

[Findit] Plain objects to represent and parse stack trace.
NOTRY=true Review URL: https://codereview.chromium.org/430943003 Cr-Commit-Position: refs/heads/master@{#288270} git-svn-id: svn://svn.chromium.org/chrome/trunk/src@288270 0039d316-1c4b-4281-b951-d872f2087c98
12a87df1 · jeun@chromium.org · 42a54f01 · 12a87df1 · 12a87df1 · 12a87df1
Commit 12a87df1 authored Aug 08, 2014 by jeun@chromium.org
Showing with 672 additions and 0 deletions

tools/findit/component_dictionary.py tools/findit/component_dictionary.py +111 -0

tools/findit/crash_utils.py tools/findit/crash_utils.py +285 -0

tools/findit/stacktrace.py tools/findit/stacktrace.py +276 -0

No files found.
--- a/tools/findit/component_dictionary.py
+++ b/tools/findit/component_dictionary.py
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+import os
+class FileDictionary(object):
+  """Two-level index of crash information for files in a stacktrace.
+  The outer dictionary is keyed by file name and the inner one by file path.
+  Each (name, path) pair stores three lists that grow in step: crashed line
+  numbers, stack frame indices and crashed function names.
+  """
+  def __init__(self):
+    """Creates an empty file dictionary."""
+    self.file_dict = {}
+  def AddFile(self, file_name, file_path, crashed_line_number,
+              stack_frame_index, function):
+    """Records one crash occurrence for a file.
+    Args:
+      file_name: The name of the crashed file.
+      file_path: The path of the crashed file.
+      crashed_line_number: The crashed line of the file.
+      stack_frame_index: The file's position in the callstack.
+      function: The name of the crashed function.
+    """
+    # Look up the record for this name/path pair, creating it on first use.
+    paths = self.file_dict.setdefault(file_name, {})
+    record = paths.get(file_path)
+    if record is None:
+      record = paths[file_path] = {
+          'line_numbers': [],
+          'stack_frame_indices': [],
+          'function': [],
+      }
+    # Append the new occurrence to each of the three lists.
+    record['line_numbers'].append(crashed_line_number)
+    record['stack_frame_indices'].append(stack_frame_index)
+    record['function'].append(function)
+  def _GetRecord(self, file_path):
+    """Returns the crash record stored for a path; KeyError if unknown."""
+    # The outer key is always the base name of the path.
+    return self.file_dict[os.path.basename(file_path)][file_path]
+  def GetPathDic(self, file_name):
+    """Returns the path -> crash information mapping for a file name."""
+    return self.file_dict[file_name]
+  def GetCrashedLineNumbers(self, file_path):
+    """Returns the list of crashed line numbers for a file path."""
+    return self._GetRecord(file_path)['line_numbers']
+  def GetCrashStackFrameindex(self, file_path):
+    """Returns the list of stack frame indices for a file path."""
+    return self._GetRecord(file_path)['stack_frame_indices']
+  def GetCrashFunction(self, file_path):
+    """Returns the list of crashed function names for a file path."""
+    return self._GetRecord(file_path)['function']
+  def __iter__(self):
+    """Iterates over the file names stored in the dictionary."""
+    return iter(self.file_dict)
+class ComponentDictionary(object):
+  """Maps each component (blink, chrome, etc) to its own file dictionary."""
+  def __init__(self, components):
+    """Creates one empty FileDictionary per given component."""
+    self.component_dict = dict(
+        (component, FileDictionary()) for component in components)
+  def __iter__(self):
+    """Iterates over the component names."""
+    return iter(self.component_dict)
+  def GetFileDict(self, component):
+    """Returns the file dictionary of a component; KeyError if unknown."""
+    return self.component_dict[component]
+  def GenerateFileDict(self, stack_frame_list):
+    """Fills the file dictionaries from a list of StackFrame objects.
+    Frames whose component was not passed to the constructor are ignored.
+    """
+    for frame in stack_frame_list:
+      # Only components registered at construction time are tracked.
+      files = self.component_dict.get(frame.component)
+      if files is None:
+        continue
+      # Record the frame in its component's file dictionary.
+      files.AddFile(frame.file_name, frame.file_path,
+                    frame.crashed_line_number, frame.index, frame.function)
--- a/tools/findit/crash_utils.py
+++ b/tools/findit/crash_utils.py
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+import cgi
+import json
+import os
+import time
+import urllib
+# Value returned by the distance/stack-frame helpers in this file when there
+# is nothing to take a minimum over.
+INFINITY = float('inf')
+def NormalizePathLinux(path):
+  """Normalizes linux path.
+  Args:
+    path: A string representing a path.
+  Returns:
+    A tuple containing a component this path is in (e.g blink, skia, etc)
+    and a path in that component's repository.
+  """
+  normalized_path = os.path.abspath(path)
+  # TODO(jeun): Integrate with parsing DEPS file.
+  # The three cases are mutually exclusive; they must be one if/elif/else
+  # chain, otherwise the 'v8' result is overwritten by the fallback branch.
+  if 'src/v8/' in normalized_path:
+    component = 'v8'
+    # Keep everything after the first marker, even if it appears again.
+    normalized_path = normalized_path.split('src/v8/', 1)[1]
+  elif 'WebKit/' in normalized_path:
+    component = 'blink'
+    normalized_path = normalized_path.split('WebKit/', 1)[1]
+  else:
+    component = 'chromium'
+  # Drop the checkout prefix that precedes the last '/build/' directory.
+  if '/build/' in normalized_path:
+    normalized_path = normalized_path.split('/build/')[-1]
+  # Paths that are not already rooted at 'src/' or 'Source/' get 'src/'.
+  if not normalized_path.startswith(('src/', 'Source/')):
+    normalized_path = 'src/' + normalized_path
+  return (component, normalized_path)
+def SplitRange(regression):
+  """Splits a range as retrieved from clusterfuzz.
+  Args:
+    regression: A string in format 'r1234:r5678'. May be None or empty.
+  Returns:
+    A list containing two numbers represented in string, for example
+    ['1234','5678'], or None if the input is missing or does not consist
+    of exactly two parts separated by ':'.
+  """
+  # Missing regression information (None or '') yields no range.
+  if not regression:
+    return None
+  revisions = regression.split(':')
+  if len(revisions) != 2:
+    return None
+  # Strip the leading 'r' from both the start and the end of the range.
+  return [revision.lstrip('r') for revision in revisions]
+def LoadJSON(json_string):
+  """Loads json object from string, or None.
+  Args:
+    json_string: A string to get object from.
+  Returns:
+    The decoded JSON value if the input is valid JSON text, None otherwise
+    (including when the input is not a string at all, e.g. None).
+  """
+  try:
+    return json.loads(json_string)
+  # ValueError: malformed JSON text. TypeError: input is not a string.
+  except (ValueError, TypeError):
+    return None
+def GetDataFromURL(url, retries=10, sleep_time=0.1):
+  """Retrieves raw data from URL, trying up to 'retries' times.
+  Args:
+    url: URL to get data from.
+    retries: Number of times to try the connection.
+    sleep_time: Time in seconds to wait before retrying connection.
+  Returns:
+    None if the data retrieval fails, or the raw data.
+  """
+  for _ in range(retries):
+    # Open the URL; on failure wait and try again.
+    try:
+      response = urllib.urlopen(url)
+    except IOError:
+      time.sleep(sleep_time)
+      continue
+    if not response:
+      continue
+    # Read the payload and always release the connection afterwards.
+    try:
+      return response.read()
+    except IOError:
+      time.sleep(sleep_time)
+    finally:
+      response.close()
+  # Return None if it fails to read data from URL 'retries' times.
+  return None
+def FindMinLineDistance(crashed_line_list, changed_line_numbers):
+  """Returns the smallest gap between any crashed line and any changed line.
+  For example, if the file crashed on line 200 and the CL changes lines
+  203, 204 and 205, the result is 3.
+  Args:
+    crashed_line_list: A list of lines that the file crashed on.
+    changed_line_numbers: A list of lines that the file changed.
+  Returns:
+    The minimum distance. If either of the input lists is empty,
+    it returns inf.
+  """
+  # Distance of every (crashed line, changed line) pair.
+  distances = [abs(crashed_line - changed_line)
+               for crashed_line in crashed_line_list
+               for changed_line in changed_line_numbers]
+  # No pairs at all means there is nothing to measure.
+  if not distances:
+    return INFINITY
+  return min(distances)
+def GuessIfSameSubPath(path1, path2):
+  """Guesses if two paths represent same path.
+  Splits both paths on '/' and counts the distinct folder/file names they
+  share. The paths are considered the same if they share at least 3 names,
+  or all the distinct names of the shorter path when it has fewer than 3.
+  Args:
+    path1: First path.
+    path2: Second path to compare.
+  Returns:
+    True if they are thought to be the same path, False otherwise.
+  """
+  components1 = set(path1.split('/'))
+  components2 = set(path2.split('/'))
+  # Measure the threshold in distinct names too; counting repeated names
+  # would make a path with a repeated folder fail to match even itself.
+  required = min(3, len(components1), len(components2))
+  return len(components1 & components2) >= required
+def FindMinStackFrameNumber(stack_frame_indices, priorities):
+  """Finds the minimum stack number, from the list of stack numbers.
+  Only the entries with the highest priority (the lowest priority number)
+  are considered.
+  Args:
+    stack_frame_indices: A list of lists containing stack position; entry i
+                         belongs to priorities[i].
+    priorities: A list of priority for each file.
+  Returns:
+    Inf if there are no priorities or no stack frame indices for the highest
+    priority entries, minimum stack number otherwise.
+  """
+  # min() of an empty list raises, so handle missing input up front.
+  if not priorities:
+    return INFINITY
+  highest_priority = min(priorities)
+  # Gather the stack frame numbers of all entries with the highest priority.
+  flattened = []
+  for i, priority in enumerate(priorities):
+    if priority == highest_priority:
+      flattened.extend(stack_frame_indices[i])
+  # If no stack frame information is available, return inf. Else, return min.
+  if not flattened:
+    return INFINITY
+  return min(flattened)
+def AddHyperlink(text, link):
+  """Wraps text in an HTML anchor tag pointing at link.
+  Args:
+    text: A string to add link.
+    link: A link to add to the string.
+  Returns:
+    A string with hyperlink added; both parts are HTML-escaped.
+  """
+  # The link is placed inside a quoted attribute, so quotes are escaped too.
+  escaped_parts = (cgi.escape(link, quote=True), cgi.escape(text))
+  return '<a href="%s">%s</a>' % escaped_parts
+def PrettifyList(l):
+  """Renders a list as its str() form without the surrounding brackets.
+  Args:
+    l: A list to prettify.
+  Returns:
+    A string representation of the list, elements separated by comma.
+  """
+  rendered = str(l)
+  # Drop the first and the last character, i.e. '[' and ']'.
+  return rendered[1:len(rendered) - 1]
+def PrettifyFiles(file_list):
+  """Renders a list of files as hyperlinks, one per line.
+  Args:
+    file_list: A list of tuple, (file_name, file_url).
+  Returns:
+    A string that starts with a newline, followed by one indented,
+    newline-terminated hyperlink per file.
+  """
+  linked_lines = ''.join(
+      '      %s\n' % AddHyperlink(file_name, file_url)
+      for (file_name, file_url) in file_list)
+  return '\n' + linked_lines
+def Intersection(crashed_line_list, stack_frame_index, changed_line_numbers,
+                 line_range=3):
+  """Finds the overlap between changed lines and crashed lines.
+  For every crashed line, looks for a changed line that is either the
+  crashed line itself or one of the 'line_range' lines right before it.
+  Args:
+    crashed_line_list: A list of lines that the file crashed on.
+    stack_frame_index: A list of positions in stack for each of the lines.
+    changed_line_numbers: A list of lines that the file changed.
+    line_range: Number of lines to look backwards from crashed lines.
+  Returns:
+    A tuple (line_intersection, stack_frame_index_intersection).
+    line_intersection: Intersection between crashed_line_list and
+                       changed_line_numbers; each line appears only once.
+    stack_frame_index_intersection: Stack number for each of the intersections.
+  """
+  line_intersection = []
+  stack_frame_index_intersection = []
+  # Build the set once for the exact-match check inside the loop.
+  changed_line_set = set(changed_line_numbers)
+  # Iterate through the crashed lines and their positions in the stack. The
+  # loop variable has its own name so the parameter is not rebound.
+  for (line, frame_index) in zip(crashed_line_list, stack_frame_index):
+    # A changed line counts if it lies in [line - line_range, line].
+    window_start = line - line_range
+    for changed_line in changed_line_numbers:
+      # If this changed line is outside the window, check the next one.
+      if not window_start <= changed_line <= line:
+        continue
+      # Prefer the crashed line itself if the CL changes it; otherwise use
+      # the first changed line found inside the window.
+      if line in changed_line_set:
+        intersected_line = line
+      else:
+        intersected_line = changed_line
+      # Avoid adding the same line twice.
+      if intersected_line not in line_intersection:
+        line_intersection.append(intersected_line)
+        stack_frame_index_intersection.append(frame_index)
+      # One match per crashed line is enough.
+      break
+  return (line_intersection, stack_frame_index_intersection)
--- a/tools/findit/stacktrace.py
+++ b/tools/findit/stacktrace.py
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+import os
+import re
+import crash_utils
+class StackFrame(object):
+  """Plain record describing one frame of a stacktrace.
+  Attributes:
+    index: An index of the stack frame.
+    component: A component this line represents, such as blink, chrome, etc.
+    file_name: The name of the file that crashed.
+    function: The function that caused the crash.
+    file_path: The path of the crashed file.
+    crashed_line_number: The line of the file that caused the crash.
+  """
+  def __init__(self, stack_frame_index, component, file_name,
+               function, file_path, crashed_line_number):
+    """Stores the given values as attributes without validating them."""
+    # Where the frame sits in its callstack and which component owns it.
+    self.index = stack_frame_index
+    self.component = component
+    # Location of the crash inside the source tree.
+    self.file_path = file_path
+    self.file_name = file_name
+    self.crashed_line_number = crashed_line_number
+    self.function = function
+class CallStack(object):
+  """An ordered list of stack frames together with a priority.
+  The priority is stored as given by the creator; the parser in this file
+  uses 0 for the crash stack and 1 for the other callstacks.
+  """
+  def __init__(self, stack_priority):
+    """Creates an empty callstack with the given priority."""
+    self.priority = stack_priority
+    self.frame_list = []
+  def Add(self, stacktrace_line):
+    """Appends a frame to the end of the callstack."""
+    frames = self.frame_list
+    frames.append(stacktrace_line)
+  def GetTopNFrames(self, n):
+    """Returns a new list with the first n frames of the callstack."""
+    top_frames = self.frame_list[0:n]
+    return top_frames
+class Stacktrace(object):
+  """Represents Stacktrace object.
+  Contains a list of callstacks, because one stacktrace might have more than
+  one callstacks.
+  Attributes:
+    stack_list: A list of CallStack objects that contain at least one frame,
+                or None if the stacktrace passed in was empty.
+  """
+  # All patterns are compiled once here instead of once per parsed line.
+  # Syzyasan: a new stack starts with 'Crash stack:', 'freed stack:', etc.
+  _SYZYASAN_STACK_START = re.compile(r'^(.*) stack:$')
+  # (pattern, priority) pairs checked in order. The crash stack gets
+  # priority 0, all other callstacks get priority 1.
+  _TSAN_STACK_STARTS = (
+      (re.compile(r'^(Read|Write) of size \d+'), 0),
+      (re.compile(r'^Previous (write|read) of size \d+'), 1),
+      (re.compile(r'^Location is heap block of size \d+'), 1),
+  )
+  # In asan and other build types the crash stack can start in two ways.
+  _DEFAULT_STACK_STARTS = (
+      (re.compile(r'^==\d+== ?ERROR:'), 0),
+      (re.compile(r'^(READ|WRITE) of size \d+ at'), 0),
+      (re.compile(r'^freed by thread T\d+ (.* )?here:'), 1),
+      (re.compile(r'^previously allocated by thread T\d+ (.* )?here:'), 1),
+      (re.compile(r'^Thread T\d+ (.* )?created by'), 1),
+  )
+  # A stack frame line starts with '#' followed by the frame number.
+  _STACK_FRAME_INDEX = re.compile(r'#(\d+)')
+  def __init__(self, stacktrace, build_type):
+    """Parses the given stacktrace right away; see ParseStacktrace."""
+    self.stack_list = []
+    self.ParseStacktrace(stacktrace, build_type)
+  def ParseStacktrace(self, stacktrace, build_type):
+    """Parses stacktrace and normalizes it.
+    If there are multiple callstacks within the stacktrace,
+    it will parse each of them separately, and store them in the stack_list
+    variable. Callstacks without any parsed frame are not stored.
+    Args:
+      stacktrace: The stacktrace to parse. It is iterated element by element
+                  and every element is handled as one line, so this is
+                  presumably a list of line strings rather than one big
+                  string -- verify against the caller.
+      build_type: A string containing the build type of the crash.
+    """
+    # If the passed in stacktrace is empty, the object does not represent
+    # anything.
+    if not stacktrace:
+      self.stack_list = None
+      return
+    # Reset the stack list.
+    self.stack_list = []
+    # Note that we do not need exact stack frame index, we only need relative
+    # position of a frame within a callstack. The reason for not extracting
+    # index from a line is that some stack frames do not have index.
+    stack_frame_index = 0
+    current_stack = None
+    for line in stacktrace:
+      (is_new_callstack, stack_priority) = self.__IsStartOfNewCallStack(
+          line, build_type)
+      if is_new_callstack:
+        # Store the callstack collected so far (if it has frames) and start
+        # a fresh one whose frame numbering begins at zero again.
+        if current_stack is not None and current_stack.frame_list:
+          self.stack_list.append(current_stack)
+        current_stack = CallStack(stack_priority)
+        stack_frame_index = 0
+      # Lines that come before the first callstack header have no callstack
+      # to be added to, so they are skipped instead of crashing the parser.
+      if current_stack is None:
+        continue
+      # Generate stack frame object from the line.
+      parsed_stack_frame = self.__GenerateStackFrame(
+          stack_frame_index, line, build_type)
+      # If the line does not represent the stack frame, ignore this line.
+      if not parsed_stack_frame:
+        continue
+      # Add the parsed stack frame object to the current stack.
+      current_stack.Add(parsed_stack_frame)
+      stack_frame_index += 1
+    # Add the current callstack only if there are frames in it.
+    if current_stack is not None and current_stack.frame_list:
+      self.stack_list.append(current_stack)
+  def __IsStartOfNewCallStack(self, line, build_type):
+    """Check if this line is the start of the new callstack.
+    Since each builds have different format of stacktrace, the logic for
+    checking the line for all builds is handled in here.
+    Args:
+      line: Line to check for.
+      build_type: The name of the build.
+    Returns:
+      A tuple (is_new_callstack, priority). If the line starts a callstack,
+      the tuple is (True, 0) for the crash stack and (True, 1) for any other
+      callstack. Otherwise it is (False, -1), where -1 is a dummy priority.
+    """
+    no_match = (False, -1)
+    # Currently not supported.
+    if 'android' in build_type:
+      return no_match
+    if 'syzyasan' in build_type:
+      match = self._SYZYASAN_STACK_START.match(line)
+      if not match:
+        return no_match
+      # Crash stack gets priority 0, other callstacks all get priority 1.
+      if match.group(1) == 'Crash':
+        return (True, 0)
+      return (True, 1)
+    # Tsan has its own header lines; everything else uses the asan style.
+    if 'tsan' in build_type:
+      stack_starts = self._TSAN_STACK_STARTS
+    else:
+      stack_starts = self._DEFAULT_STACK_STARTS
+    for (pattern, priority) in stack_starts:
+      if pattern.match(line):
+        return (True, priority)
+    # The line does not match any pattern.
+    return no_match
+  def __GenerateStackFrame(self, stack_frame_index, line, build_type):
+    """Extracts information from a line in stacktrace.
+    Args:
+      stack_frame_index: A stack frame index of this line.
+      line: A stacktrace string to extract data from.
+      build_type: A string containing the build type
+                    of this crash (e.g. linux_asan_chrome_mp).
+    Returns:
+      A StackFrame object built from the line, or None if the line is not a
+      stack frame, is malformed, or belongs to a component other than blink
+      and chromium.
+    """
+    line_parts = line.split()
+    try:
+      # Filter out lines that are not stack frame.
+      if not self._STACK_FRAME_INDEX.match(line_parts[0]):
+        return None
+      # Tsan has different stack frame style from other builds.
+      if build_type.startswith('linux_tsan'):
+        file_path_and_line = line_parts[-2]
+        function = ' '.join(line_parts[1:-2])
+      else:
+        file_path_and_line = line_parts[-1]
+        function = ' '.join(line_parts[3:-1])
+      # Get file path and line info from the line.
+      file_path_and_line = file_path_and_line.split(':')
+      file_path = file_path_and_line[0]
+      crashed_line_number = int(file_path_and_line[1])
+    # Return None if the line is malformed: too few parts (IndexError) or a
+    # line number that is not an integer (ValueError).
+    except (IndexError, ValueError):
+      return None
+    # Normalize the file path so that it can be compared to repository path.
+    file_name = os.path.basename(file_path)
+    (component, file_path) = crash_utils.NormalizePathLinux(file_path)
+    # FIXME(jeun): Add other components.
+    if component not in ('blink', 'chromium'):
+      return None
+    # Return a new stack frame object with the parsed information.
+    return StackFrame(stack_frame_index, component, file_name, function,
+                      file_path, crashed_line_number)
+  def __getitem__(self, index):
+    """Returns the callstack stored at the given position of stack_list."""
+    return self.stack_list[index]
+  def GetCrashStack(self):
+    """Returns the first callstack with priority 0, or None if there is none.
+    Also returns None when stack_list is None, i.e. the stacktrace was empty.
+    """
+    for callstack in self.stack_list or []:
+      # Only the crash stack has the priority 0.
+      if callstack.priority == 0:
+        return callstack
+    return None