001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.math.random;
019
020 import java.io.BufferedReader;
021 import java.io.File;
022 import java.io.FileReader;
023 import java.io.IOException;
024 import java.io.InputStreamReader;
025 import java.io.Serializable;
026 import java.net.URL;
027 import java.util.ArrayList;
028 import java.util.List;
029
030 import org.apache.commons.math.MathRuntimeException;
031 import org.apache.commons.math.stat.descriptive.StatisticalSummary;
032 import org.apache.commons.math.stat.descriptive.SummaryStatistics;
033
034 /**
035 * Implements <code>EmpiricalDistribution</code> interface. This implementation
036 * uses what amounts to the
037 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
038 * Variable Kernel Method</a> with Gaussian smoothing:<p>
039 * <strong>Digesting the input file</strong>
040 * <ol><li>Pass the file once to compute min and max.</li>
041 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
042 * <li>Pass the data file again, computing bin counts and univariate
043 * statistics (mean, std dev.) for each of the bins </li>
044 * <li>Divide the interval (0,1) into subintervals associated with the bins,
045 * with the length of a bin's subinterval proportional to its count.</li></ol>
046 * <strong>Generating random values from the distribution</strong><ol>
047 * <li>Generate a uniformly distributed value in (0,1) </li>
048 * <li>Select the subinterval to which the value belongs.
049 * <li>Generate a random Gaussian value with mean = mean of the associated
050 * bin and std dev = std dev of associated bin.</li></ol></p><p>
051 *<strong>USAGE NOTES:</strong><ul>
052 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb
053 * is to set the bin count to approximately the length of the input file divided
054 * by 10. </li>
055 *<li>The input file <i>must</i> be a plain text file containing one valid numeric
056 * entry per line.</li>
057 * </ul></p>
058 *
059 * @version $Revision: 925812 $ $Date: 2010-03-21 11:49:31 -0400 (Sun, 21 Mar 2010) $
060 */
061 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
062
063 /** Serializable version identifier */
064 private static final long serialVersionUID = 5729073523949762654L;
065
066 /** List of SummaryStatistics objects characterizing the bins */
067 private List<SummaryStatistics> binStats = null;
068
069 /** Sample statistics */
070 private SummaryStatistics sampleStats = null;
071
072 /** Max loaded value */
073 private double max = Double.NEGATIVE_INFINITY;
074
075 /** Min loaded value */
076 private double min = Double.POSITIVE_INFINITY;
077
078 /** Grid size */
079 private double delta = 0d;
080
081 /** number of bins */
082 private int binCount = 1000;
083
084 /** is the distribution loaded? */
085 private boolean loaded = false;
086
087 /** upper bounds of subintervals in (0,1) "belonging" to the bins */
088 private double[] upperBounds = null;
089
090 /** RandomData instance to use in repeated calls to getNext() */
091 private RandomData randomData = new RandomDataImpl();
092
093 /**
094 * Creates a new EmpiricalDistribution with the default bin count.
095 */
096 public EmpiricalDistributionImpl() {
097 binStats = new ArrayList<SummaryStatistics>();
098 }
099
100 /**
101 * Creates a new EmpiricalDistribution with the specified bin count.
102 *
103 * @param binCount number of bins
104 */
105 public EmpiricalDistributionImpl(int binCount) {
106 this.binCount = binCount;
107 binStats = new ArrayList<SummaryStatistics>();
108 }
109
110 /**
111 * Computes the empirical distribution from the provided
112 * array of numbers.
113 *
114 * @param in the input data array
115 */
116 public void load(double[] in) {
117 DataAdapter da = new ArrayDataAdapter(in);
118 try {
119 da.computeStats();
120 fillBinStats(in);
121 } catch (IOException e) {
122 throw new MathRuntimeException(e);
123 }
124 loaded = true;
125
126 }
127
128 /**
129 * Computes the empirical distribution using data read from a URL.
130 * @param url url of the input file
131 *
132 * @throws IOException if an IO error occurs
133 */
134 public void load(URL url) throws IOException {
135 BufferedReader in =
136 new BufferedReader(new InputStreamReader(url.openStream()));
137 try {
138 DataAdapter da = new StreamDataAdapter(in);
139 da.computeStats();
140 if (sampleStats.getN() == 0) {
141 throw MathRuntimeException.createEOFException("URL {0} contains no data",
142 url);
143 }
144 in = new BufferedReader(new InputStreamReader(url.openStream()));
145 fillBinStats(in);
146 loaded = true;
147 } finally {
148 try {
149 in.close();
150 } catch (IOException ex) {
151 // ignore
152 }
153 }
154 }
155
156 /**
157 * Computes the empirical distribution from the input file.
158 *
159 * @param file the input file
160 * @throws IOException if an IO error occurs
161 */
162 public void load(File file) throws IOException {
163 BufferedReader in = new BufferedReader(new FileReader(file));
164 try {
165 DataAdapter da = new StreamDataAdapter(in);
166 da.computeStats();
167 in = new BufferedReader(new FileReader(file));
168 fillBinStats(in);
169 loaded = true;
170 } finally {
171 try {
172 in.close();
173 } catch (IOException ex) {
174 // ignore
175 }
176 }
177 }
178
179 /**
180 * Provides methods for computing <code>sampleStats</code> and
181 * <code>beanStats</code> abstracting the source of data.
182 */
183 private abstract class DataAdapter{
184
185 /**
186 * Compute bin stats.
187 *
188 * @throws IOException if an error occurs computing bin stats
189 */
190 public abstract void computeBinStats() throws IOException;
191
192 /**
193 * Compute sample statistics.
194 *
195 * @throws IOException if an error occurs computing sample stats
196 */
197 public abstract void computeStats() throws IOException;
198
199 }
200
201 /**
202 * Factory of <code>DataAdapter</code> objects. For every supported source
203 * of data (array of doubles, file, etc.) an instance of the proper object
204 * is returned.
205 */
206 private class DataAdapterFactory{
207 /**
208 * Creates a DataAdapter from a data object
209 *
210 * @param in object providing access to the data
211 * @return DataAdapter instance
212 */
213 public DataAdapter getAdapter(Object in) {
214 if (in instanceof BufferedReader) {
215 BufferedReader inputStream = (BufferedReader) in;
216 return new StreamDataAdapter(inputStream);
217 } else if (in instanceof double[]) {
218 double[] inputArray = (double[]) in;
219 return new ArrayDataAdapter(inputArray);
220 } else {
221 throw MathRuntimeException.createIllegalArgumentException(
222 "input data comes from unsupported datasource: {0}, " +
223 "supported sources: {1}, {2}",
224 in.getClass().getName(),
225 BufferedReader.class.getName(), double[].class.getName());
226 }
227 }
228 }
229 /**
230 * <code>DataAdapter</code> for data provided through some input stream
231 */
232 private class StreamDataAdapter extends DataAdapter{
233
234 /** Input stream providing access to the data */
235 private BufferedReader inputStream;
236
237 /**
238 * Create a StreamDataAdapter from a BufferedReader
239 *
240 * @param in BufferedReader input stream
241 */
242 public StreamDataAdapter(BufferedReader in){
243 super();
244 inputStream = in;
245 }
246
247 /** {@inheritDoc} */
248 @Override
249 public void computeBinStats() throws IOException {
250 String str = null;
251 double val = 0.0d;
252 while ((str = inputStream.readLine()) != null) {
253 val = Double.parseDouble(str);
254 SummaryStatistics stats = binStats.get(findBin(val));
255 stats.addValue(val);
256 }
257
258 inputStream.close();
259 inputStream = null;
260 }
261
262 /** {@inheritDoc} */
263 @Override
264 public void computeStats() throws IOException {
265 String str = null;
266 double val = 0.0;
267 sampleStats = new SummaryStatistics();
268 while ((str = inputStream.readLine()) != null) {
269 val = Double.valueOf(str).doubleValue();
270 sampleStats.addValue(val);
271 }
272 inputStream.close();
273 inputStream = null;
274 }
275 }
276
277 /**
278 * <code>DataAdapter</code> for data provided as array of doubles.
279 */
280 private class ArrayDataAdapter extends DataAdapter {
281
282 /** Array of input data values */
283 private double[] inputArray;
284
285 /**
286 * Construct an ArrayDataAdapter from a double[] array
287 *
288 * @param in double[] array holding the data
289 */
290 public ArrayDataAdapter(double[] in){
291 super();
292 inputArray = in;
293 }
294
295 /** {@inheritDoc} */
296 @Override
297 public void computeStats() throws IOException {
298 sampleStats = new SummaryStatistics();
299 for (int i = 0; i < inputArray.length; i++) {
300 sampleStats.addValue(inputArray[i]);
301 }
302 }
303
304 /** {@inheritDoc} */
305 @Override
306 public void computeBinStats() throws IOException {
307 for (int i = 0; i < inputArray.length; i++) {
308 SummaryStatistics stats =
309 binStats.get(findBin(inputArray[i]));
310 stats.addValue(inputArray[i]);
311 }
312 }
313 }
314
315 /**
316 * Fills binStats array (second pass through data file).
317 *
318 * @param in object providing access to the data
319 * @throws IOException if an IO error occurs
320 */
321 private void fillBinStats(Object in) throws IOException {
322 // Set up grid
323 min = sampleStats.getMin();
324 max = sampleStats.getMax();
325 delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
326
327 // Initialize binStats ArrayList
328 if (!binStats.isEmpty()) {
329 binStats.clear();
330 }
331 for (int i = 0; i < binCount; i++) {
332 SummaryStatistics stats = new SummaryStatistics();
333 binStats.add(i,stats);
334 }
335
336 // Filling data in binStats Array
337 DataAdapterFactory aFactory = new DataAdapterFactory();
338 DataAdapter da = aFactory.getAdapter(in);
339 da.computeBinStats();
340
341 // Assign upperBounds based on bin counts
342 upperBounds = new double[binCount];
343 upperBounds[0] =
344 ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
345 for (int i = 1; i < binCount-1; i++) {
346 upperBounds[i] = upperBounds[i-1] +
347 ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
348 }
349 upperBounds[binCount-1] = 1.0d;
350 }
351
352 /**
353 * Returns the index of the bin to which the given value belongs
354 *
355 * @param value the value whose bin we are trying to find
356 * @return the index of the bin containing the value
357 */
358 private int findBin(double value) {
359 return Math.min(
360 Math.max((int) Math.ceil((value- min) / delta) - 1, 0),
361 binCount - 1);
362 }
363
364 /**
365 * Generates a random value from this distribution.
366 *
367 * @return the random value.
368 * @throws IllegalStateException if the distribution has not been loaded
369 */
370 public double getNextValue() throws IllegalStateException {
371
372 if (!loaded) {
373 throw MathRuntimeException.createIllegalStateException("distribution not loaded");
374 }
375
376 // Start with a uniformly distributed random number in (0,1)
377 double x = Math.random();
378
379 // Use this to select the bin and generate a Gaussian within the bin
380 for (int i = 0; i < binCount; i++) {
381 if (x <= upperBounds[i]) {
382 SummaryStatistics stats = binStats.get(i);
383 if (stats.getN() > 0) {
384 if (stats.getStandardDeviation() > 0) { // more than one obs
385 return randomData.nextGaussian
386 (stats.getMean(),stats.getStandardDeviation());
387 } else {
388 return stats.getMean(); // only one obs in bin
389 }
390 }
391 }
392 }
393 throw new MathRuntimeException("no bin selected");
394 }
395
396 /**
397 * Returns a {@link StatisticalSummary} describing this distribution.
398 * <strong>Preconditions:</strong><ul>
399 * <li>the distribution must be loaded before invoking this method</li></ul>
400 *
401 * @return the sample statistics
402 * @throws IllegalStateException if the distribution has not been loaded
403 */
404 public StatisticalSummary getSampleStats() {
405 return sampleStats;
406 }
407
408 /**
409 * Returns the number of bins.
410 *
411 * @return the number of bins.
412 */
413 public int getBinCount() {
414 return binCount;
415 }
416
417 /**
418 * Returns a List of {@link SummaryStatistics} instances containing
419 * statistics describing the values in each of the bins. The list is
420 * indexed on the bin number.
421 *
422 * @return List of bin statistics.
423 */
424 public List<SummaryStatistics> getBinStats() {
425 return binStats;
426 }
427
428 /**
429 * <p>Returns a fresh copy of the array of upper bounds for the bins.
430 * Bins are: <br/>
431 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
432 * (upperBounds[binCount-2], upperBounds[binCount-1] = max].</p>
433 *
434 * <p>Note: In versions 1.0-2.0 of commons-math, this method
435 * incorrectly returned the array of probability generator upper
436 * bounds now returned by {@link #getGeneratorUpperBounds()}.</p>
437 *
438 * @return array of bin upper bounds
439 * @since 2.1
440 */
441 public double[] getUpperBounds() {
442 double[] binUpperBounds = new double[binCount];
443 binUpperBounds[0] = min + delta;
444 for (int i = 1; i < binCount - 1; i++) {
445 binUpperBounds[i] = binUpperBounds[i-1] + delta;
446 }
447 binUpperBounds[binCount - 1] = max;
448 return binUpperBounds;
449 }
450
451 /**
452 * <p>Returns a fresh copy of the array of upper bounds of the subintervals
453 * of [0,1] used in generating data from the empirical distribution.
454 * Subintervals correspond to bins with lengths proportional to bin counts.</p>
455 *
456 * <p>In versions 1.0-2.0 of commons-math, this array was (incorrectly) returned
457 * by {@link #getUpperBounds()}.</p>
458 *
459 * @since 2.1
460 * @return array of upper bounds of subintervals used in data generation
461 */
462 public double[] getGeneratorUpperBounds() {
463 int len = upperBounds.length;
464 double[] out = new double[len];
465 System.arraycopy(upperBounds, 0, out, 0, len);
466 return out;
467 }
468
469 /**
470 * Property indicating whether or not the distribution has been loaded.
471 *
472 * @return true if the distribution has been loaded
473 */
474 public boolean isLoaded() {
475 return loaded;
476 }
477 }