001/*- 002 * Copyright 2016 Diamond Light Source Ltd. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the Eclipse Public License v1.0 006 * which accompanies this distribution, and is available at 007 * http://www.eclipse.org/legal/epl-v10.html 008 */ 009 010package org.eclipse.january.dataset; 011 012/** 013 * Estimators of the scale of a Dataset. 014 * <p> 015 * A class of static methods to produce estimations of the scale of variation within a Dataset. 016 * The available estimators are: 017 * <ul> 018 * <li> Median Absolute Deviation </li> 019 * <li> S<sub>n</sub> of Croux and Rousseeuw (1992).</li> 020 * </ul> 021 * <p> 022 * Croux, C. and P. J. Rousseeuw, "Time-efficient algorithms for two highly robust estimators of scale", Computational Statistics, Volume 1, eds. Y. Dodge and J.Whittaker, Physica-Verlag, Heidelberg, pp411--428 (1992). 023 */ 024public class Outliers { 025 026 private final static double MADSCALEFACTOR = 1.4826; 027 private final static double SNSCALEFACTOR = 1.1926; 028 029 /** 030 * Returns the Median Absolute Deviation (MAD) and the median. 031 * @param data 032 * The data for which the median and the MAD are to be calculated 033 * @return A two-element array of doubles, consisting of the MAD and the median of the data 034 */ 035 public static double[] medianAbsoluteDeviation(Dataset data) { 036 037 double median = (Double)Stats.median(data); 038 data = Maths.subtract(data, median); 039 data = Maths.abs(data); 040 double median2 = (Double)Stats.median(data); 041 double mad = MADSCALEFACTOR * median2; 042 043 return new double[]{mad, median}; 044 } 045 046 /** 047 * Returns the Sn estimator of Croux and Rousseeuw. 048 * <p> 049 * This is the simple O(n²) version of the calculation algorithm. 050 * @param data 051 * The data for which the estimator is to be calculated. 052 * @return The value of the Sn estimator for the data 053 */ 054 public static double snNaive(Dataset data) { 055 056 Dataset medAbs = DatasetFactory.zeros(data); 057 Dataset dif = DatasetFactory.zeros(data); 058 059 IndexIterator it = data.getIterator(); 060 int count = 0; 061 while (it.hasNext()) { 062 double val = data.getElementDoubleAbs(it.index); 063 Maths.subtract(data, val, dif); 064 Maths.abs(dif,dif); 065 //Lower median - Math.floor((n/2)+1) of sorted 066 dif.sort(null); 067 medAbs.setObjectAbs(count++, lowMed(dif)); 068 } 069 070 071 //Higher median - Math.floor((n+1)/2) of sorted 072 medAbs.sort(null); 073 double median = highMed(medAbs); 074 075 return median * SNSCALEFACTOR; 076 } 077 078 /** 079 * Returns the Sn estimator of Croux and Rousseeuw. 080 * <p> 081 * This is the complex O(nlog n) version of the calculation algorithm. 082 * @param data 083 * The data for which the estimator is to be calculated. 084 * @return The value of the Sn estimator for the data 085 */ 086 public static double snFast(Dataset data) { 087 088 Dataset sorted = data.clone(); 089 sorted.sort(null); 090 091 Dataset medAbs = DatasetFactory.zeros(data); 092 093 IndexIterator it = data.getIterator(); 094 int count = 0; 095 while (it.hasNext()) { 096 MedianOfTwoSortedSets snuff = new MedianForSn(sorted, it.index); 097 medAbs.setObjectAbs(count++, snuff.get()); 098 } 099 100 101 //Higher median - Math.floor((n+1)/2) of sorted 102 medAbs.sort(null); 103 double median = highMed(medAbs); 104 105 return median * SNSCALEFACTOR; 106 } 107 108 /** 109 * Returns the lomed 110 * <p> 111 * Returns the lomed (low median) of a sorted Dataset. 112 * @param data 113 * A sorted Dataset for which the low median is to be calculated. 114 * @return 115 * The value of the lomed of the data 116 */ 117 public static double lowMed(Dataset data) { 118 return data.getElementDoubleAbs((int)Math.floor((data.getSize()/2))); 119 } 120 121 /** 122 * Returns the himed 123 * <p> 124 * Returns the himed (high median) of a sorted Dataset. 125 * @param data 126 * A sorted Dataset for which the low median is to be calculated. 127 * @return 128 * The value of the himed of the data 129 */ 130 public static double highMed(Dataset data) { 131 return data.getElementDoubleAbs((int)Math.floor((data.getSize()+1)/2-1)); 132 } 133 134 /** 135 * Calculates the overall median of two double arrays 136 * @param a 137 * @param b 138 * the two arrays for which the overall median is desired. 139 * @return the overall median of the two arrays 140 */ 141 public static double medianOFTwoPrimitiveArrays (double[] a, double[] b) { 142 MedianOfTwoArrays medio = new MedianOfTwoArrays(a, b); 143 return medio.get(); 144 } 145} 146 147/** 148 * Allows the calculation of the median of two arrays. 149 * <p> 150 * Subclasses must implement getA() and getB() to return the elements of A or B 151 * at the given index. The length of a must be less than or equal to that of b. 152 * The constructor must set the sizes nA and nB. 153 */ 154abstract class MedianOfTwoSortedSets{ 155 int nB, nA, diff, diffLeft; 156 157 public final double get() { 158 // Initialize the left and right markers for the set of candidate 159 // values. These are inclusive on both left and right. 160 int leftA = 0, leftB = 0; 161 @SuppressWarnings("unused") // keep rightA for symmetry 162 int rightA = nB-1, rightB = nB-1; 163 164 while (nB > 1) { 165 // For 0-based indexing, the lomed is the element at floor((n+1)/2)-1 166 int medianIndex = (int) Math.floor((nB+1)/2)-1; 167 int medianAIndex = leftA + medianIndex, 168 medianBIndex = leftB + medianIndex; 169 double medA = getAm(medianAIndex), 170 medB = getBm(medianBIndex); 171 172 int smallerShift = 0; 173 if (nB % 2 == 0) { 174 // N even: the smaller lomed, as well as anything smaller than it, cannot be the overall median 175 smallerShift = +1; 176 } 177 178 if (medA >= medB) { 179 rightA = medianAIndex; 180 leftB = medianBIndex + smallerShift; 181 } else { 182 rightB = medianBIndex; 183 leftA = medianAIndex + smallerShift; 184 } 185 186 // Different lengths 187 // It should be floor((l_m-1 + 1)/2)) 188 // this is newLength, defined above 189 // Difference between left and right 190 nB = rightB - leftB + 1; 191 } 192 193 // when the array is length 1, right and left will be the same. 194 // The lomed of a two element array is the smaller of the two 195 return Math.min(getAm(leftA), getBm(leftB)); 196 } 197 198 // Get the value in the expanded array 199 private double getAm(int i) { 200 int firstElement = diffLeft, 201 lastElement = diffLeft + nA - 1; 202 if (i < firstElement) { 203 return Double.NEGATIVE_INFINITY; 204 } else if (i > lastElement) { 205 return Double.POSITIVE_INFINITY; 206 } else { 207 return getA(i - diffLeft); 208 } 209 } 210 211 private double getBm(int i) { 212 return getB(i); 213 } 214 215 // Get the values in the original arrays 216 protected abstract double getA(int i); 217 protected abstract double getB(int i); 218 219 // Call this to set up the length difference variables. 220 protected void setDiffs() { 221 diff = nB - nA; 222 diffLeft = diff/2; 223 } 224} 225 226class MedianOfTwoArrays extends MedianOfTwoSortedSets { 227 228 double[] a, b; 229 230 public MedianOfTwoArrays(double[] ain, double[] bin) { 231 if (bin.length >= ain.length) { 232 this.a = ain; 233 nA = ain.length; 234 this.b = bin; 235 nB = bin.length; 236 } else { 237 this.a = bin; 238 nA = bin.length; 239 this.b = ain; 240 nB = ain.length; 241 } 242 setDiffs(); 243 } 244 @Override 245 protected double getA(int i) { 246 return a[i]; 247 } 248 @Override 249 protected double getB(int i) { 250 return b[i]; 251 } 252} 253 254class MedianForSn extends MedianOfTwoSortedSets { 255 Dataset xj; 256 int referenceIndex; 257 boolean lowerIsBigger; 258 259 public MedianForSn(Dataset xj, int referenceIndex) { 260 this.xj = xj; 261 this.referenceIndex = referenceIndex; 262 263 // determine which of the two halves of the array is larger 264 int lowerSize = referenceIndex, upperSize = xj.getSize() - referenceIndex - 1; 265 lowerIsBigger = lowerSize > upperSize; 266 267 // Set the array sizes 268 if (lowerIsBigger) { 269 nA = upperSize; 270 nB = lowerSize; 271 } else { 272 nA = lowerSize; 273 nB = upperSize; 274 } 275 setDiffs(); 276 } 277 278 @Override 279 protected double getA(int i) { 280 return (!lowerIsBigger) ? getLower(i) : getUpper(i); 281 } 282 @Override 283 protected double getB(int i) { 284 return (!lowerIsBigger) ? getUpper(i) : getLower(i); 285 } 286 287 private double getLower(int i) { 288 return xj.getDouble(referenceIndex) - xj.getDouble(referenceIndex - 1 - i); 289 } 290 private double getUpper(int i) { 291 return xj.getDouble(i + referenceIndex + 1) - xj.getDouble(referenceIndex); 292 } 293}