impala/tests/util/calculation_util.py

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Utility functions for calculating common mathematical measurements. Note that although
# some of these functions are available in external python packages (ex. numpy), these
# are simple enough that it is better to implement them ourselves to avoid extra
# dependencies.

from __future__ import absolute_import, division, print_function
from builtins import range
import math
import random
import string

def calculate_avg(values):
  """Return the arithmetic mean of 'values' as a float.

  'values' must support both sum() and len(). An empty collection raises
  ZeroDivisionError because the total is divided by a count of zero.
  """
  total = sum(values)
  count = len(values)
  return total / float(count)

def calculate_stddev(values):
  """Return the population standard deviation of a collection of numbers.

  The result is the square root of the mean squared deviation from the average, i.e.
  the squared deviations are divided by N rather than N - 1. 'values' is traversed
  more than once and passed to calculate_avg(), so it must be a sized, re-iterable
  collection (e.g. a list) rather than a one-shot iterator.
  """
  mean = calculate_avg(values)
  squared_deviations = []
  for value in values:
    squared_deviations.append((value - mean) ** 2)
  variance = calculate_avg(squared_deviations)
  return math.sqrt(variance)

def calculate_median(values):
  """Return the median of the non-None entries of a numeric iterable.

  'values' may be any iterable, including a one-shot iterator: it is materialized
  exactly once, so checking for None entries cannot exhaust it before sorting.
  None entries are treated as missing measurements and ignored, which avoids
  comparing None against numbers while sorting.

  Returns None when there is nothing to take a median of (empty input or only None
  entries). For an odd number of entries the middle value is returned unchanged; for
  an even number the mean of the two middle values is returned.
  """
  present = [v for v in values if v is not None]
  if not present:
    return None
  present.sort()
  count = len(present)
  middle = count // 2
  if count % 2 == 0:
    # Even count: average the two values straddling the midpoint.
    return (present[middle] + present[middle - 1]) / 2
  return present[middle]

def calculate_geomean(values):
  """Return the geometric mean of a sized collection of numerics.

  Every value is raised to the power 1/N and the results are multiplied together,
  rather than taking the N-th root of the full product. Returns None when 'values'
  is empty.
  """
  count = len(values)
  if count <= 0:
    return None
  root = 1.0 / count
  result = 1.0
  for item in values:
    result = result * item ** root
  return result

def calculate_tval(avg, stddev, iters, ref_avg, ref_stddev, ref_iters):
  """
  Returns the t-test t value comparing a result against a reference.

  'avg', 'stddev' and 'iters' are the mean, standard deviation and sample count of the
  result; the 'ref_' arguments are the same quantities for the reference. The value is
  computed with the Welch's t-test formula:

    SEM (standard error of the mean) = sqrt(stddev^2 / iters + ref_stddev^2 / ref_iters)
    t = (avg - ref_avg) / SEM

  A zero sample count, or two zero standard deviations, leads to a division by zero.
  For more information see:
  http://en.wikipedia.org/wiki/Student%27s_t-distribution#Table_of_selected_values
  http://en.wikipedia.org/wiki/Student's_t-test
  """
  variance_term = math.pow(stddev, 2) / iters
  ref_variance_term = math.pow(ref_stddev, 2) / ref_iters
  std_error = math.sqrt(variance_term + ref_variance_term)
  mean_delta = avg - ref_avg
  return mean_delta / std_error

def get_random_id(length):
  """Return a random string of 'length' characters drawn from A-Z and 0-9.

  Characters are picked independently with random.choice(), so the result depends on
  the state of the module-level random generator.
  """
  alphabet = string.ascii_uppercase + string.digits
  picked = [random.choice(alphabet) for _ in range(length)]
  return ''.join(picked)


def calculate_mwu(samples, ref_samples):
  """
  Calculates the Mann-Whitney U Test Z value for the given samples and reference.

  Every observation is labelled with its origin ('A' for 'samples', 'B' for
  'ref_samples') and the labelled observations are sorted together. U is the number of
  (sample, reference) pairs in which the reference observation sorts first. The
  returned Z value is U standardized by its mean and standard deviation. Ties between
  the two groups are assumed not to occur. If either input is empty the standard
  deviation is zero and the final division fails.
  """
  labelled = sorted([(s, 'A') for s in samples] + [(s, 'B') for s in ref_samples])
  # Assume no ties
  u_stat = 0
  refs_seen = 0
  for _, origin in labelled:
    if origin == 'B':
      refs_seen += 1
    else:
      # Each reference observation already passed sorts before this sample.
      u_stat += refs_seen
  sample_count = len(samples)
  ref_count = len(ref_samples)
  pair_count = sample_count * ref_count
  # u is normally distributed with the following mean and standard deviation:
  expected_u = pair_count / 2.0
  u_stddev = math.sqrt(pair_count * (1 + len(labelled)) / 12.0)
  return (u_stat - expected_u) / u_stddev