From ecaaeb2af6f63e810d693a1c2bb6ddb64a019fcc Mon Sep 17 00:00:00 2001 From: manyue Date: Wed, 20 Jun 2012 11:26:23 +0100 Subject: [PATCH] One Pass Statistics (cherry picked from commit 005fba9c76931db790a87b9bd5924da9fedd5bb6) Signed-off-by: Christoph Ruegg --- src/Numerics/Statistics/Correlation.cs | 29 ++- .../Statistics/DescriptiveStatistics.cs | 213 ++++++++---------- src/Numerics/Statistics/Statistics.cs | 20 +- 3 files changed, 134 insertions(+), 128 deletions(-) diff --git a/src/Numerics/Statistics/Correlation.cs b/src/Numerics/Statistics/Correlation.cs index bd31c401..67abb60a 100644 --- a/src/Numerics/Statistics/Correlation.cs +++ b/src/Numerics/Statistics/Correlation.cs @@ -32,6 +32,7 @@ namespace MathNet.Numerics.Statistics { using System; using System.Collections.Generic; + using Properties; /// /// A class with correlation measures between two datasets. @@ -49,12 +50,10 @@ namespace MathNet.Numerics.Statistics int n = 0; double r = 0.0; - // BUG: PERFORMANCE degraded due to tripple iteration over both IEnumerables - - double meanA = dataA.Mean(); - double meanB = dataB.Mean(); - double sdevA = dataA.StandardDeviation(); - double sdevB = dataB.StandardDeviation(); + double meanA = 0; + double meanB = 0; + double varA = 0; + double varB = 0; using (IEnumerator ieA = dataA.GetEnumerator()) using (IEnumerator ieB = dataB.GetEnumerator()) @@ -65,9 +64,21 @@ namespace MathNet.Numerics.Statistics { throw new ArgumentOutOfRangeException("dataB", "Datasets dataA and dataB need to have the same length. dataB is shorter."); } + double Acurrent = ieA.Current; + double Bcurrent = ieB.Current; - n++; - r += (ieA.Current - meanA) * (ieB.Current - meanB) / (sdevA * sdevB); + double deltaA = Acurrent - meanA; + double scaleDeltaA = deltaA / ++n; + + double deltaB = Bcurrent - meanB; + double scaleDeltaB = deltaB / n; + + meanA += scaleDeltaA; + meanB += scaleDeltaB; + + varA += scaleDeltaA * deltaA * (n - 1); + varB += scaleDeltaB * deltaB * (n - 1); + r += ((deltaA * deltaB * (n - 1)) / n); } if (ieB.MoveNext()) { @@ -75,7 +86,7 @@ namespace MathNet.Numerics.Statistics } } - return r / (n - 1); + return r / Math.Sqrt(varA * varB); } } } diff --git a/src/Numerics/Statistics/DescriptiveStatistics.cs b/src/Numerics/Statistics/DescriptiveStatistics.cs index 5d34f72f..609bf066 100644 --- a/src/Numerics/Statistics/DescriptiveStatistics.cs +++ b/src/Numerics/Statistics/DescriptiveStatistics.cs @@ -45,7 +45,8 @@ namespace MathNet.Numerics.Statistics /// Initializes a new instance of the class. /// /// The sample data. - public DescriptiveStatistics(IEnumerable data) : this(data, false) + public DescriptiveStatistics(IEnumerable data) + : this(data, false) { } @@ -53,7 +54,8 @@ namespace MathNet.Numerics.Statistics /// Initializes a new instance of the class. /// /// The sample data. - public DescriptiveStatistics(IEnumerable data) : this(data, false) + public DescriptiveStatistics(IEnumerable data) + : this(data, false) { } @@ -71,6 +73,11 @@ namespace MathNet.Numerics.Statistics /// public DescriptiveStatistics(IEnumerable data, bool increasedAccuracy) { + if (data == null) + { + throw new ArgumentNullException("data"); + } + if (increasedAccuracy) { ComputeHA(data); @@ -97,6 +104,12 @@ namespace MathNet.Numerics.Statistics /// public DescriptiveStatistics(IEnumerable data, bool increasedAccuracy) { + if (data == null) + { + throw new ArgumentNullException("data"); + } + + if (increasedAccuracy) { ComputeHA(data); @@ -171,9 +184,8 @@ namespace MathNet.Numerics.Statistics /// A sequence of datapoints. private void Compute(IEnumerable data) { - Mean = data.Mean(); + double mean = 0; double variance = 0; - double correction = 0; double skewness = 0; double kurtosis = 0; double minimum = Double.PositiveInfinity; @@ -181,40 +193,26 @@ namespace MathNet.Numerics.Statistics int n = 0; foreach (var xi in data) { - double diff = xi - Mean; - correction += diff; - double tmp = diff * diff; - variance += tmp; - tmp *= diff; - skewness += tmp; - tmp *= diff; - kurtosis += tmp; + double delta = xi - mean; + double scaleDelta = delta / ++n; + double scaleDeltaSQR = scaleDelta * scaleDelta; + double tmpDelta = delta * (n - 1); + + mean += scaleDelta; + + kurtosis += tmpDelta * scaleDelta * scaleDeltaSQR * (n * n - 3 * n + 3) + + 6 * scaleDeltaSQR * variance - 4 * scaleDelta * skewness; + + skewness += tmpDelta * scaleDeltaSQR * (n - 2) - 3 * scaleDelta * variance; + variance += tmpDelta * scaleDelta; + if (minimum > xi) { minimum = xi; } if (maximum < xi) { maximum = xi; } - n++; } + SetStatistics(mean, variance, skewness, kurtosis, minimum, maximum, n); + } - Count = n; - Minimum = minimum; - Maximum = maximum; - Variance = (variance - (correction * correction / n)) / (n - 1); - StandardDeviation = Math.Sqrt(Variance); - if (Variance != 0) - { - if (n > 2) - { - Skewness = (double)n / ((n - 1) * (n - 2)) * (skewness / (Variance * StandardDeviation)); - } - if (n > 3) - { - Kurtosis = (((double)n * (n + 1)) - / ((n - 1) * (n - 2) * (n - 3)) - * (kurtosis / (Variance * Variance))) - - ((3.0 * (n - 1) * (n - 1)) / ((n - 2) * (n - 3))); - } - } - } /// /// Computes descriptive statistics from a stream of nullable data values. @@ -222,9 +220,8 @@ namespace MathNet.Numerics.Statistics /// A sequence of datapoints. private void Compute(IEnumerable data) { - Mean = data.Mean(); + double mean = 0; double variance = 0; - double correction = 0; double skewness = 0; double kurtosis = 0; double minimum = Double.PositiveInfinity; @@ -234,43 +231,25 @@ namespace MathNet.Numerics.Statistics { if (xi.HasValue) { - double diff = xi.Value - Mean; - double tmp = diff * diff; - correction += diff; - variance += tmp; - tmp *= diff; - skewness += tmp; - tmp *= diff; - kurtosis += tmp; + double delta = xi.Value - mean; + double scaleDelta = delta / ++n; + double scaleDeltaSQR = scaleDelta * scaleDelta; + double tmpDelta = delta * (n - 1); + + mean += scaleDelta; + + kurtosis += tmpDelta * scaleDelta * scaleDeltaSQR * (n * n - 3 * n + 3) + + 6 * scaleDeltaSQR * variance - 4 * scaleDelta * skewness; + + skewness += tmpDelta * scaleDeltaSQR * (n - 2) - 3 * scaleDelta * variance; + variance += tmpDelta * scaleDelta; if (minimum > xi) { minimum = xi.Value; } if (maximum < xi) { maximum = xi.Value; } - n++; } } - Count = n; - if (n > 0) - { - Minimum = minimum; - Maximum = maximum; - Variance = (variance - (correction * correction / n)) / (n - 1); - StandardDeviation = Math.Sqrt(Variance); - if (Variance != 0) - { - if (n > 2) - { - Skewness = (double)n / ((n - 1) * (n - 2)) * (skewness / (Variance * StandardDeviation)); - } + SetStatistics(mean, variance, skewness, kurtosis, minimum, maximum, n); - if (n > 3) - { - Kurtosis = (((double)n * (n + 1)) - / ((n - 1) * (n - 2) * (n - 3)) - * (kurtosis / (Variance * Variance))) - - ((3.0 * (n - 1) * (n - 1)) / ((n - 2) * (n - 3))); - } - } - } } /// @@ -279,10 +258,8 @@ namespace MathNet.Numerics.Statistics /// A sequence of datapoints. private void ComputeHA(IEnumerable data) { - Mean = data.Mean(); - decimal mean = (decimal)Mean; + decimal mean = 0; decimal variance = 0; - decimal correction = 0; decimal skewness = 0; decimal kurtosis = 0; decimal minimum = Decimal.MaxValue; @@ -290,39 +267,24 @@ namespace MathNet.Numerics.Statistics int n = 0; foreach (decimal xi in data) { - decimal diff = xi - mean; - decimal tmp = diff * diff; - correction += diff; - variance += tmp; - tmp *= diff; - skewness += tmp; - tmp *= diff; - kurtosis += tmp; + decimal delta = xi - mean; + decimal scaleDelta = delta / ++n; + decimal scaleDeltaSQR = scaleDelta * scaleDelta; + decimal tmpDelta = delta * (n - 1); + + mean += scaleDelta; + + kurtosis += tmpDelta * scaleDelta * scaleDeltaSQR * (n * n - 3 * n + 3) + + 6 * scaleDeltaSQR * variance - 4 * scaleDelta * skewness; + + skewness += tmpDelta * scaleDeltaSQR * (n - 2) - 3 * scaleDelta * variance; + variance += tmpDelta * scaleDelta; if (minimum > xi) { minimum = xi; } if (maximum < xi) { maximum = xi; } - n++; } - Count = n; - Minimum = (double)minimum; - Maximum = (double)maximum; - Variance = (double)(variance - (correction * correction / n)) / (n - 1); - StandardDeviation = Math.Sqrt(Variance); - if (Variance != 0) - { - if (n > 2) - { - Skewness = (double)n / ((n - 1) * (n - 2)) * ((double)skewness / (Variance * StandardDeviation)); - } + SetStatistics((double)mean, (double)variance, (double)skewness, (double)kurtosis, (double)minimum, (double)maximum, n); - if (n > 3) - { - Kurtosis = (((double)n * (n + 1)) - / ((n - 1) * (n - 2) * (n - 3)) - * ((double)kurtosis / (Variance * Variance))) - - ((3.0 * (n - 1) * (n - 1)) / ((n - 2) * (n - 3))); - } - } } /// @@ -331,10 +293,8 @@ namespace MathNet.Numerics.Statistics /// A sequence of datapoints. private void ComputeHA(IEnumerable data) { - Mean = data.Mean(); - decimal mean = (decimal)Mean; + decimal mean = 0; decimal variance = 0; - decimal correction = 0; decimal skewness = 0; decimal kurtosis = 0; decimal minimum = Decimal.MaxValue; @@ -344,43 +304,66 @@ namespace MathNet.Numerics.Statistics { if (xi.HasValue) { - decimal diff = xi.Value - mean; - decimal tmp = diff * diff; - correction += diff; - variance += tmp; - tmp *= diff; - skewness += tmp; - tmp *= diff; - kurtosis += tmp; + decimal delta = xi.Value - mean; + decimal scaleDelta = delta / ++n; + decimal scaleDeltaSQR = scaleDelta * scaleDelta; + decimal tmpDelta = delta * (n - 1); + + mean += scaleDelta; + + kurtosis += tmpDelta * scaleDelta * scaleDeltaSQR * (n * n - 3 * n + 3) + + 6 * scaleDeltaSQR * variance - 4 * scaleDelta * skewness; + + skewness += tmpDelta * scaleDeltaSQR * (n - 2) - 3 * scaleDelta * variance; + variance += tmpDelta * scaleDelta; if (minimum > xi) { minimum = xi.Value; } if (maximum < xi) { maximum = xi.Value; } - n++; } } + SetStatistics((double)mean, (double)variance, (double)skewness, (double)kurtosis, (double)minimum, (double)maximum, n); + } + + /// + /// Internal use. Method use for setting the statistics. + /// + /// For setting Mean. + /// For setting Variance. + /// For setting Skewness. + /// For setting Kurtosis. + /// For setting Minimum. + /// For setting Maximum. + /// For setting Count. + private void SetStatistics(double mean, double variance, double skewness, double kurtosis, double minimum, double maximum, int n) + { + Mean = mean; Count = n; if (n > 0) { - Minimum = (double) minimum; - Maximum = (double) maximum; - Variance = (double)(variance - (correction * correction / n)) / (n - 1); - StandardDeviation = Math.Sqrt(Variance); + Minimum = minimum; + Maximum = maximum; + if (n > 1) + { + Variance = variance / (n - 1); + StandardDeviation = Math.Sqrt(Variance); + } if (Variance != 0) { if (n > 2) { - Skewness = (double)n / ((n - 1) * (n - 2)) * ((double)skewness / (Variance * StandardDeviation)); + Skewness = (double)n / ((n - 1) * (n - 2)) * (skewness / (Variance * StandardDeviation)); } if (n > 3) { Kurtosis = (((double)n * (n + 1)) / ((n - 1) * (n - 2) * (n - 3)) - * ((double)kurtosis / (Variance * Variance))) + * (kurtosis / (Variance * Variance))) - ((3.0 * (n - 1) * (n - 1)) / ((n - 2) * (n - 3))); } } } } } + } diff --git a/src/Numerics/Statistics/Statistics.cs b/src/Numerics/Statistics/Statistics.cs index a95c2732..bba51a97 100644 --- a/src/Numerics/Statistics/Statistics.cs +++ b/src/Numerics/Statistics/Statistics.cs @@ -32,6 +32,7 @@ namespace MathNet.Numerics.Statistics { using System; using System.Collections.Generic; + using System.Linq; using Properties; /// @@ -455,7 +456,7 @@ namespace MathNet.Numerics.Statistics if (dataArray.Count % 2 == 0) { double lower = OrderSelect(dataArray, 0, dataArray.Count - 1, index - 1); - double upper = OrderSelect(dataArray, 0, dataArray.Count - 1, index); + double upper = dataArray.Skip(index - 1).Minimum(); return (lower + upper) / 2.0; } @@ -543,12 +544,23 @@ namespace MathNet.Numerics.Statistics return samples[left]; } - // The pivot point. + + // The pivot point. Choose median of left, right and center + //to be the pivot and arrange so that + //samples[left]<=samples[right]<=samples[center] + int center = (left + right) / 2; + if (samples[center] < samples[left]) + Sorting.Swap(samples, left, center); + if (samples[center] < samples[right]) + Sorting.Swap(samples, right, center); + if (samples[right] < samples[left]) + Sorting.Swap(samples, right, left); + double pivot = samples[right]; // The partioning code. - int i = left - 1; - for (int j = left; j <= right - 1; j++) + int i = left; + for (int j = left+1; j <= right - 1; j++) { if (samples[j] <= pivot) {