From 3d039413b9e63adfb2fd2e48feed9080c243f534 Mon Sep 17 00:00:00 2001 From: Christoph Ruegg Date: Thu, 21 Mar 2013 15:17:16 +0100 Subject: [PATCH] Statistics: Array and Streaming PopulationVariance --- src/Numerics/Statistics/ArrayStatistics.cs | 28 ++++++- .../Statistics/SortedArrayStatistics.cs | 36 ++++----- src/Numerics/Statistics/Statistics.cs | 75 ++----------------- .../Statistics/StreamingStatistics.cs | 43 +++++++++-- .../StatisticsTests/StatisticsTests.cs | 10 +++ 5 files changed, 97 insertions(+), 95 deletions(-) diff --git a/src/Numerics/Statistics/ArrayStatistics.cs b/src/Numerics/Statistics/ArrayStatistics.cs index b37818cc..ccb5afb4 100644 --- a/src/Numerics/Statistics/ArrayStatistics.cs +++ b/src/Numerics/Statistics/ArrayStatistics.cs @@ -93,7 +93,7 @@ namespace MathNet.Numerics.Statistics ulong m = 0; for (int i = 0; i < data.Length; i++) { - mean += (data[i] - mean) / ++m; + mean += (data[i] - mean)/++m; } return mean; } @@ -114,10 +114,32 @@ namespace MathNet.Numerics.Statistics for (int i = 1; i < data.Length; i++) { t += data[i]; - double diff = ((i + 1) * data[i]) - t; - variance += (diff * diff) / ((i + 1) * i); + double diff = ((i + 1)*data[i]) - t; + variance += (diff*diff)/((i + 1)*i); } return variance/(data.Length - 1); } + + /// + /// Estimates the biased population variance from the unsorted data array. + /// On a dataset of size N will use an N normalizer + /// Returns NaN if data is empty or any entry is NaN. + /// + /// Sample array, no sorting is assumed. + public static double PopulationVariance(double[] data) + { + if (data == null) throw new ArgumentNullException("data"); + if (data.Length == 0) return double.NaN; + + double variance = 0; + double t = data[0]; + for (int i = 1; i < data.Length; i++) + { + t += data[i]; + double diff = ((i + 1)*data[i]) - t; + variance += (diff*diff)/((i + 1)*i); + } + return variance/data.Length; + } } } \ No newline at end of file diff --git a/src/Numerics/Statistics/SortedArrayStatistics.cs b/src/Numerics/Statistics/SortedArrayStatistics.cs index 9cabbd79..fe9ab77c 100644 --- a/src/Numerics/Statistics/SortedArrayStatistics.cs +++ b/src/Numerics/Statistics/SortedArrayStatistics.cs @@ -42,8 +42,8 @@ namespace MathNet.Numerics.Statistics public static class SortedArrayStatistics { - const double Third = 1d / 3d; - const double Half = 1d / 2d; + const double Third = 1d/3d; + const double Half = 1d/2d; /// /// Returns the smallest value from the sorted data array (ascending). @@ -88,7 +88,7 @@ namespace MathNet.Numerics.Statistics /// Percentile selector, between 0 and 100 (inclusive). public static double Percentile(double[] data, int p) { - return Quantile(data, p / 100d); + return Quantile(data, p/100d); } /// @@ -180,7 +180,7 @@ namespace MathNet.Numerics.Statistics case QuantileCompatibility.R2: case QuantileCompatibility.SAS5: { - double h = data.Length * tau + Half; + double h = data.Length*tau + Half; return (data[(int) Math.Ceiling(h - Half) - 1] + data[(int) (h + Half) - 1])*Half; } case QuantileCompatibility.R3: @@ -194,42 +194,42 @@ namespace MathNet.Numerics.Statistics case QuantileCompatibility.SAS1: { double h = data.Length*tau; - var hf = (int)h; - return data[hf - 1] + (h - hf) * (data[hf] - data[hf - 1]); + var hf = (int) h; + return data[hf - 1] + (h - hf)*(data[hf] - data[hf - 1]); } case QuantileCompatibility.R5: { double h = data.Length*tau + Half; - var hf = (int)h; - return data[hf - 1] + (h - hf) * (data[hf] - data[hf - 1]); + var hf = (int) h; + return data[hf - 1] + (h - hf)*(data[hf] - data[hf - 1]); } case QuantileCompatibility.R6: case QuantileCompatibility.SAS4: case QuantileCompatibility.Nist: { double h = (data.Length + 1)*tau; - var hf = (int)h; - return data[hf - 1] + (h - hf) * (data[hf] - data[hf - 1]); + var hf = (int) h; + return data[hf - 1] + (h - hf)*(data[hf] - data[hf - 1]); } case QuantileCompatibility.R7: case QuantileCompatibility.Excel: { double h = (data.Length - 1)*tau + 1d; - var hf = (int)h; - return data[hf - 1] + (h - hf) * (data[hf] - data[hf - 1]); + var hf = (int) h; + return data[hf - 1] + (h - hf)*(data[hf] - data[hf - 1]); } case QuantileCompatibility.R8: case QuantileCompatibility.Default: { - double h = (data.Length + Third) * tau + Third; - var hf = (int)h; - return data[hf - 1] + (h - hf) * (data[hf] - data[hf - 1]); + double h = (data.Length + Third)*tau + Third; + var hf = (int) h; + return data[hf - 1] + (h - hf)*(data[hf] - data[hf - 1]); } case QuantileCompatibility.R9: { - double h = (data.Length + 1d/4d) * tau + 3d/8d; - var hf = (int)h; - return data[hf - 1] + (h - hf) * (data[hf] - data[hf - 1]); + double h = (data.Length + 1d/4d)*tau + 3d/8d; + var hf = (int) h; + return data[hf - 1] + (h - hf)*(data[hf] - data[hf - 1]); } default: throw new NotSupportedException(); diff --git a/src/Numerics/Statistics/Statistics.cs b/src/Numerics/Statistics/Statistics.cs index e673b294..1c345882 100644 --- a/src/Numerics/Statistics/Statistics.cs +++ b/src/Numerics/Statistics/Statistics.cs @@ -142,34 +142,10 @@ namespace MathNet.Numerics.Statistics /// The biased population variance of the sample. public static double PopulationVariance(this IEnumerable data) { - if (data == null) - { - throw new ArgumentNullException("data"); - } - - double variance = 0; - double t = 0; - ulong j = 0; - - using (IEnumerator iterator = data.GetEnumerator()) - { - if (iterator.MoveNext()) - { - j++; - t = iterator.Current; - } - - while (iterator.MoveNext()) - { - j++; - double xi = iterator.Current; - t += xi; - double diff = (j * xi) - t; - variance += (diff * diff) / (j * (j - 1)); - } - } - - return variance / j; + var array = data as double[]; + return array != null + ? ArrayStatistics.PopulationVariance(array) + : StreamingStatistics.PopulationVariance(data); } /// @@ -179,47 +155,8 @@ namespace MathNet.Numerics.Statistics /// The population variance of the sample. public static double PopulationVariance(this IEnumerable data) { - if (data == null) - { - throw new ArgumentNullException("data"); - } - - double variance = 0; - double t = 0; - ulong j = 0; - - using (IEnumerator iterator = data.GetEnumerator()) - { - while (true) - { - bool hasNext = iterator.MoveNext(); - if (!hasNext) - { - break; - } - - if (iterator.Current.HasValue) - { - j++; - t = iterator.Current.Value; - break; - } - } - - while (iterator.MoveNext()) - { - if (iterator.Current.HasValue) - { - j++; - double xi = iterator.Current.Value; - t += xi; - double diff = (j * xi) - t; - variance += (diff * diff) / (j * (j - 1)); - } - } - } - - return variance / j; + if (data == null) throw new ArgumentNullException("data"); + return StreamingStatistics.PopulationVariance(data.Where(d => d.HasValue).Select(d => d.Value)); } /// diff --git a/src/Numerics/Statistics/StreamingStatistics.cs b/src/Numerics/Statistics/StreamingStatistics.cs index 5f717a2b..86146c25 100644 --- a/src/Numerics/Statistics/StreamingStatistics.cs +++ b/src/Numerics/Statistics/StreamingStatistics.cs @@ -54,7 +54,7 @@ namespace MathNet.Numerics.Statistics } any = true; } - return any ? min : double.NaN; + return any ? min : double.NaN; } /// @@ -93,7 +93,7 @@ namespace MathNet.Numerics.Statistics bool any = false; foreach (var d in stream) { - mean += (d - mean) / ++m; + mean += (d - mean)/++m; any = true; } return any ? mean : double.NaN; @@ -125,11 +125,44 @@ namespace MathNet.Numerics.Statistics j++; double xi = iterator.Current; t += xi; - double diff = (j * xi) - t; - variance += (diff * diff) / (j * (j - 1)); + double diff = (j*xi) - t; + variance += (diff*diff)/(j*(j - 1)); } } - return j > 1 ? variance / (j - 1) : double.NaN; + return j > 1 ? variance/(j - 1) : double.NaN; + } + + /// + /// Estimates the biased population variance from the enumerable, in a single pass without memoization. + /// On a dataset of size N will use an N normalizer + /// Returns NaN if data is empty or any entry is NaN. + /// + /// Sample stream, no sorting is assumed. + public static double PopulationVariance(IEnumerable stream) + { + if (stream == null) throw new ArgumentNullException("stream"); + + double variance = 0; + double t = 0; + ulong j = 0; + using (var iterator = stream.GetEnumerator()) + { + if (iterator.MoveNext()) + { + j++; + t = iterator.Current; + } + + while (iterator.MoveNext()) + { + j++; + double xi = iterator.Current; + t += xi; + double diff = (j*xi) - t; + variance += (diff*diff)/(j*(j - 1)); + } + } + return variance/j; } } } diff --git a/src/UnitTests/StatisticsTests/StatisticsTests.cs b/src/UnitTests/StatisticsTests/StatisticsTests.cs index f3a0c192..7b4e31ef 100644 --- a/src/UnitTests/StatisticsTests/StatisticsTests.cs +++ b/src/UnitTests/StatisticsTests/StatisticsTests.cs @@ -84,11 +84,13 @@ namespace MathNet.Numerics.UnitTests.StatisticsTests Assert.Throws(() => ArrayStatistics.Maximum(data)); Assert.Throws(() => ArrayStatistics.Mean(data)); Assert.Throws(() => ArrayStatistics.Variance(data)); + Assert.Throws(() => ArrayStatistics.PopulationVariance(data)); Assert.Throws(() => StreamingStatistics.Minimum(data)); Assert.Throws(() => StreamingStatistics.Maximum(data)); Assert.Throws(() => StreamingStatistics.Mean(data)); Assert.Throws(() => StreamingStatistics.Variance(data)); + Assert.Throws(() => StreamingStatistics.PopulationVariance(data)); } [Test] @@ -120,11 +122,13 @@ namespace MathNet.Numerics.UnitTests.StatisticsTests Assert.DoesNotThrow(() => ArrayStatistics.Maximum(data)); Assert.DoesNotThrow(() => ArrayStatistics.Mean(data)); Assert.DoesNotThrow(() => ArrayStatistics.Variance(data)); + Assert.DoesNotThrow(() => ArrayStatistics.PopulationVariance(data)); Assert.DoesNotThrow(() => StreamingStatistics.Minimum(data)); Assert.DoesNotThrow(() => StreamingStatistics.Maximum(data)); Assert.DoesNotThrow(() => StreamingStatistics.Mean(data)); Assert.DoesNotThrow(() => StreamingStatistics.Variance(data)); + Assert.DoesNotThrow(() => StreamingStatistics.PopulationVariance(data)); } [TestCase("lottery")] @@ -298,6 +302,12 @@ namespace MathNet.Numerics.UnitTests.StatisticsTests Assert.That(Statistics.PopulationVariance(new double[0]), Is.NaN); Assert.That(Statistics.PopulationVariance(new[] { 2d }), Is.Not.NaN); Assert.That(Statistics.PopulationVariance(new[] { 2d, 3d }), Is.Not.NaN); + Assert.That(ArrayStatistics.PopulationVariance(new double[0]), Is.NaN); + Assert.That(ArrayStatistics.PopulationVariance(new[] { 2d }), Is.Not.NaN); + Assert.That(ArrayStatistics.PopulationVariance(new[] { 2d, 3d }), Is.Not.NaN); + Assert.That(StreamingStatistics.PopulationVariance(new double[0]), Is.NaN); + Assert.That(StreamingStatistics.PopulationVariance(new[] { 2d }), Is.Not.NaN); + Assert.That(StreamingStatistics.PopulationVariance(new[] { 2d, 3d }), Is.Not.NaN); } ///