diff --git a/doc/sphinx/introduction.rst b/doc/sphinx/introduction.rst index fb84f2e61e1..e10fae6acdb 100644 --- a/doc/sphinx/introduction.rst +++ b/doc/sphinx/introduction.rst @@ -307,6 +307,7 @@ The following tutorials are available: * :file:`lennard_jones`: Modelling of a single-component and a two-component Lennard-Jones liquid. * :file:`visualization`: Using the online visualizers of |es|. +* :file:`error_analysis`: Statistical analysis of simulation results. * :file:`charged_system`: Modelling of ion condensation around a charged rod. * :file:`ferrofluid`: Modelling a colloidal suspension of magnetic particles. * :file:`lattice_boltzmann`: Simulations including hydrodynamic interactions using the lattice-Boltzmann method. diff --git a/doc/tutorials/CMakeLists.txt b/doc/tutorials/CMakeLists.txt index 5c4be0427a5..2ee1d0ce847 100644 --- a/doc/tutorials/CMakeLists.txt +++ b/doc/tutorials/CMakeLists.txt @@ -100,6 +100,7 @@ add_custom_target(tutorials_python) # Here: add new directory add_subdirectory(lennard_jones) +add_subdirectory(error_analysis) add_subdirectory(charged_system) add_subdirectory(lattice_boltzmann) add_subdirectory(raspberry_electrophoresis) diff --git a/doc/tutorials/Readme.md b/doc/tutorials/Readme.md index 55fa8add4d9..c6159ef23c0 100644 --- a/doc/tutorials/Readme.md +++ b/doc/tutorials/Readme.md @@ -12,6 +12,11 @@ physical systems. * **Simulate a simple Lennard-Jones liquid** Modelling of a single-component and a two-component Lennard-Jones liquid. [Guide](lennard_jones/lennard_jones.ipynb) +* **Error_analysis** + Statistical analysis of simulation results + Guide + [Part 1](error_analysis/error_analysis_part1.ipynb) | + [Part 2](error_analysis/error_analysis_part2.ipynb) * **Visualization** Using the online visualizers of ESPResSo. [Guide](visualization/visualization.ipynb) diff --git a/doc/tutorials/error_analysis/CMakeLists.txt b/doc/tutorials/error_analysis/CMakeLists.txt new file mode 100644 index 00000000000..0cb0cec1ff8 --- /dev/null +++ b/doc/tutorials/error_analysis/CMakeLists.txt @@ -0,0 +1,7 @@ +configure_tutorial_target(TARGET tutorial_err DEPENDS + error_analysis_part1.ipynb error_analysis_part2.ipynb) + +nb_export(TARGET tutorial_err SUFFIX "1" FILE "error_analysis_part1.ipynb" + HTML_RUN) +nb_export(TARGET tutorial_err SUFFIX "2" FILE "error_analysis_part2.ipynb" + HTML_RUN) diff --git a/doc/tutorials/error_analysis/NotesForTutor.md b/doc/tutorials/error_analysis/NotesForTutor.md new file mode 100644 index 00000000000..fafc11450ba --- /dev/null +++ b/doc/tutorials/error_analysis/NotesForTutor.md @@ -0,0 +1,18 @@ +# Part 1: Introduction and Binning Analysis + +## Learning goals + +* Give a brief overview of common measures of dispersion (standard deviation, + confidence interval, standard error of the mean) +* Teach binning analysis and apply it on well-behaved data and on data where + it doesn't converge (synthetic data is generated using the AR1 process) + +# Part 2: Autocorrelation Analysis + +## Learning goals + +* Teach autocorrelation analysis +* Integrate the ACF to determine the standard error of the mean +* Extract the autocorrelation time and use that information to decrease the + observable sampling frequency (and thus reduce the amount of data and + improve performance) and increase the simulation time for better statistics diff --git a/doc/tutorials/error_analysis/error_analysis_part1.ipynb b/doc/tutorials/error_analysis/error_analysis_part1.ipynb new file mode 100644 index 00000000000..f54cff0caaf --- /dev/null +++ b/doc/tutorials/error_analysis/error_analysis_part1.ipynb @@ -0,0 +1,520 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial: Error Estimation - Part 1 (Introduction and Binning Analysis)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of contents\n", + "0. [Data generation](#Data-generation)\n", + "1. [Introduction](#Introduction)\n", + "2. [Uncorrelated samples](#Uncorrelated-samples)\n", + "3. [Binning analysis](#Binning-analysis)\n", + "4. [References](#References)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data generation\n", + "\n", + "In this tutorial, you will learn how to estimate the accuracy of your simulation results. Because we are going to emply statistical methods, we need a fair amount of data to play with. The following code cell will generate two data sets which will be used throughout the tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams.update({'font.size': 18})\n", + "import sys\n", + "import logging\n", + "logging.basicConfig(level=logging.INFO, stream=sys.stdout)\n", + "\n", + "np.random.seed(43)\n", + "\n", + "def ar_1_process(n_samples, c, phi, eps):\n", + " '''\n", + " Generate a correlated random sequence with the AR(1) process.\n", + "\n", + " Parameters\n", + " ----------\n", + " n_samples: :obj:`int`\n", + " Sample size.\n", + " c: :obj:`float`\n", + " Constant term.\n", + " phi: :obj:`float`\n", + " Correlation magnitude.\n", + " eps: :obj:`float`\n", + " Shock magnitude.\n", + " '''\n", + " ys = np.zeros(n_samples)\n", + " if abs(phi) >= 1:\n", + " raise ValueError(\"abs(phi) must be smaller than 1.\")\n", + " # draw initial value from normal distribution with known mean and variance\n", + " ys[0] = np.random.normal(loc=c / (1 - phi), scale=np.sqrt(eps**2 / (1 - phi**2)))\n", + " for i in range(1, n_samples):\n", + " ys[i] = c + phi * ys[i - 1] + np.random.normal(loc=0., scale=eps)\n", + " return ys\n", + "\n", + "# generate simulation data using the AR(1) process\n", + "\n", + "logging.info(\"Generating data sets for the tutorial ...\")\n", + "\n", + "N_SAMPLES = 100000\n", + "\n", + "C_1 = 2.0\n", + "PHI_1 = 0.85\n", + "EPS_1 = 2.0\n", + "time_series_1 = ar_1_process(N_SAMPLES, C_1, PHI_1, EPS_1)\n", + "\n", + "C_2 = 0.05\n", + "PHI_2 = 0.999\n", + "EPS_2 = 1.0\n", + "time_series_2 = ar_1_process(N_SAMPLES, C_2, PHI_2, EPS_2)\n", + "\n", + "logging.info(\"Done\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(10, 6))\n", + "plt.title(\"The first 1000 samples of both time series\")\n", + "plt.plot(time_series_1[0:1000], label=\"time series 1\")\n", + "plt.plot(time_series_2[0:1000], label=\"time series 2\")\n", + "plt.xlabel(\"$i$\")\n", + "plt.ylabel(\"$X_i$\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "In this tutorial, you will learn how to do statistical analysis of your simulation data.\n", + "This is an important topic, because the statistics of your data determine how precise your simulation result is. Furthermore, knowing about the statistics can help you optimize your disk space usage.\n", + "\n", + "ESPResSo provides a lot of ways to take measurements of your system. Usually, you will sample a quantity many times during a simulation and in the end average over all samples. Intuitively, the simulation result will be more precise the more samples are taken during the simulation. However, this is not the whole truth. There are some things that need to be considered, which we will cover in this tutorial.\n", + "\n", + "Formally, if you determine a physical quantity by averaging over several samples, you only *approximate* the unknown, true mean value. Usually, the quantity is expected to fluctuate around its mean; therefore, you can never directly measure the mean. You are bound to take repeated measurements and in the end average over all samples (a finite number). In your report, you will present this average as your result. Additionally, you should express the precision of your measurements to give a proper meaning to your result. And this is where things get more involved.\n", + "\n", + "There are several different ways to express the precision of your measurements. We will begin by briefly discussing what they are and what their differences are. After that, we will continue with the *standard error of the mean* as a viable option to be presented in your simulation results.\n", + "\n", + "### Standard deviation\n", + "The standard deviation is a measure for how much individual samples are expected to deviate from the mean. We want to use precise terminology, and therefore need to state that, in fact, we cannot directly measure the standard deviation but only estimate it. A commonly used estimator for the standard deviation is\n", + "\n", + "$\n", + "\\begin{align}\n", + " \\hat{\\sigma} = \\sqrt{\\frac{1}{N-1.5}\\sum_{i=1}^{N}(X_i-\\overline{X})^2}\\tag{1}\n", + "\\end{align}\n", + "$\n", + "\n", + "where $\\hat{\\sigma}$ is the estimator of the standard deviation $\\sigma$, $N$ the number of samples, $X_i$ the individual samples and $\\overline{X}$ their mean. This estimator somewhat resembles the \"square root of the variance\". The curious $-1.5$ in the denominator is a necessary correction to make the estimator less biased (for further reading, see [1]).\n", + "\n", + "### Standard error of the mean\n", + "The standard error of the mean (often abbreviated as SEM, or $s$, and its estimator is designated $\\hat{\\sigma}_\\overline{X}$) describes how much the mean value of your sample is expected to deviate from the true mean value $\\mu$. Imagine repeating the whole simulation over and over again, taking $N$ samples every time and averaging over them. The SEM quantifies how much those averages will fluctuate around the true mean $\\mu$. In fact, it is defined as the standard deviation of the averages.\n", + "\n", + "At first glance, it might seem to be very expensive to compute the SEM, because one would have to repeat the whole simulation many times. However, under the right circumstances, the SEM can be estimated from *a single series* of $N$ measurements. We will discuss how this can be done.\n", + "\n", + "### Confidence interval\n", + "A confidence interval (CI) specifies a range of numbers within which the unknown true mean value $\\mu$ lies with a certain probability $1-\\alpha$. A common confidence level is $1-\\alpha=95~\\%$. A $95~\\%$ CI would contain the true value $\\mu$ with probability $95~\\%$. Care must be taken interpreting the CI, since the lower and upper bound of a CI are themselves random variables. Just as a simulation run drafts samples from the overall ensemble, determining a CI from a simulation run is drafting a CI from all possible CIs. When the upper and lower bound of a CI have been calculated, this range either contains the true value or not, so there no longer is a probability attached to it. However, for repeated simulations with subsequent computation of the corresponding CIs, on average $95~\\%$ of CIs will contain the true value, while $5~\\%$ won't.\n", + "\n", + "If the samples are normally distributed and the SEM is known, the upper and lower bounds of the $95~\\%$ CI are $\\overline{X} \\pm 1.96 \\, \\hat{\\sigma}_\\overline{X}$.\n", + "\n", + "### Interquartile range\n", + "The interquartile range denotes the range, within which the central $50~\\%$ of all samples lie, if one were to order them by their size. This leaves one quarter of all samples lying below the interquartile range, and another quarter of all samples above it.\n", + "\n", + "### Now – what do we use?\n", + "We are interested in the precision of our overall, averaged, simulation result, and not in the precision of the individual samples. Those are expected to fluctuate, and in many cases, those fluctuations are uninteresting for the end result. Out of the options presented above, the SEM and the CI are the only ones doing this requirement justice. Since they are related, the question boils down to how to compute the SEM, which will be the topic of the rest of this tutorial." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uncorrelated samples\n", + "How the SEM can be computed depends on the data itself. For uncorrelated samples, it is nearly trivial:\n", + "\n", + "$\n", + "\\begin{align}\n", + " \\hat\\sigma_\\overline{X} = \\frac{\\hat\\sigma}{\\sqrt{N}}\\tag{2}\n", + "\\end{align}\n", + "$\n", + "\n", + "where $\\hat\\sigma_\\overline{X}$ is the estimated SEM, $\\hat\\sigma$ is the estimated standard deviation (see eq. 1) and $N$ is the number of samples. But what does it mean for samples to be uncorrelated?\n", + "\n", + "An example for uncorrelated samples would be the rolling of a dice. The outcome of each trial is completely independent to the previous trials. We might guess any number from 1 to 6, regardless of what has been the last result. The same could be true if we ran an experiment many times independently from one another and measured a quantity each time. By looking at one experimental value, we would'nt be able to predict the next one. The best guess would be simply the mean value of the entire series. In the case of rolling a dice, correlations could for example be observed if it was more probable to obtain the same result as in the previous dice roll rather than another result.\n", + "\n", + "Usually, when you run a molecular dynamics simulation, the particles will only move by a tiny amount during a time step. Consequently, most observables also change only by a small amount during a time step and it is, therefore, more probable to obtain a similar result rather than a completely different result. If we were to sample an observable in every time step, we would get a lot of samples with very similar values. It is said that the samples are *correlated*. Only if we wait for a sufficiently long time, the system will eventually have evolved to a completely different configuration, and we can expect the observable to assume a truly independent, *uncorrelated* value.\n", + "\n", + "It is often easy to see when samples are correlated. Execute the code cell below for an example, where a small part of `time_series_1` is plotted." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(10, 6))\n", + "plt.plot(time_series_1[1000:1050], \"x\")\n", + "fig.axes[0].margins(y=0.1)\n", + "plt.xlabel(\"$i$\")\n", + "plt.ylabel(\"$X_i$\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One can clearly see that each sample lies in the vicinity of the previous one.\n", + "\n", + "Below is an example for almost completely uncorrelated samples. The data points are taken from the same time series as in the previous example, but this time they are chosen with large gaps in between (every 800th sample is used). These samples appear to fluctuate a lot more randomly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(10, 6))\n", + "plt.plot(np.arange(2000, 42000, 800), time_series_1[2000:42000:800], \"x\")\n", + "fig.axes[0].margins(y=0.1)\n", + "plt.xlabel(\"$i$\")\n", + "plt.ylabel(\"$X_i$\")\n", + "fig.axes[0].xaxis.set_major_locator(plt.MultipleLocator(base=8000))\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "However, you should not trust your eye in deciding whether or not a time series is correlated. In fact, when running molecular dynamics simulations, your best guess is to always assume that samples are correlated, and that you should use one of the following techniques for statistical analysis, and rather not just use equation (2)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Binning analysis\n", + "\n", + "Binning analysis is a straightforward method to calculate the SEM for correlated data. A time series of measurements of $N$ samples is divided into $N_\\mathrm{B}$ equally long blocks called bins. If $N$ is not an integer multiple of $N_\\mathrm{B}$, some data must be discarded to achieve this. The samples in every bin are averaged, giving the bin averages $\\overline{X}_i$. It is important that the bin size $N/N_\\mathrm{B}$ is significantly larger than the correlation time. Otherwise, binning analysis will yield the wrong SEM. \n", + "\n", + "Once we have computed the bin averages $\\overline{X}_i$, getting the SEM is straightforward: we can simply treat $\\overline{X}_i$ as an uncorrelated time series. In other words, we can compute the SEM by using equation (1) and (2)!\n", + "\n", + "Let's implement this." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "BIN_SIZE = 2000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden", + "solution2_first": true + }, + "source": [ + "#### Exercise\n", + "* Determine the maximally possible number of bins of size ```BIN_SIZE``` with the data in ```time_series_1```, and store it in a variable ```N_BINS```.\n", + "* Create a numpy array called ```bin_avgs``` of length ```N_BINS```.\n", + "* Compute the bin averages of ```time_series_1``` and store them in ```bin_avgs```." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden" + }, + "source": [ + "```python\n", + "N_BINS = N_SAMPLES // BIN_SIZE\n", + "bin_avgs = np.zeros(N_BINS)\n", + "for i in range(N_BINS):\n", + " bin_avgs[i] = np.average(time_series_1[i * BIN_SIZE:(i + 1) * BIN_SIZE])\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden", + "solution2_first": true + }, + "source": [ + "#### Exercise\n", + "Compute the average of all bin averages and store it in ```avg```. This is the overall average, our best guess for the measured quantity. Furthermore, compute the standard error of the mean using equations (1) and (2) from the values in ```bin_avgs``` and store it in ```sem```." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden" + }, + "source": [ + "```python\n", + "avg = np.average(bin_avgs)\n", + "sem = np.sqrt(np.sum((bin_avgs - avg)**2) / (N_BINS - 1.5) / N_BINS)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Best guess for measured quantity: {avg:.3f}\")\n", + "print(f\"Standard error of the mean: {sem:.3f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we already have an estimate on how precise our simulation result is. But how do we know if we chose the appropriate bin size? The answer is, we can perform binning analysis for many different bin sizes and check when the SEM converges. For that we would like to define a function that does the binning analysis in one go." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden", + "solution2_first": true + }, + "source": [ + "#### Exercise\n", + "Define a function called ```do_binning_analysis``` that takes as arguments ```data``` (a numpy array containing the samples) and ```bin_size``` and returns the estimated SEM. You can reuse your code from the previous exercises and adapt it to be part of the function." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden" + }, + "source": [ + "```python\n", + "def do_binning_analysis(data, bin_size):\n", + " n_samples = len(data)\n", + " n_bins = n_samples // bin_size\n", + " bin_avgs = np.mean(data[:n_bins * bin_size].reshape((n_bins, -1)), axis=1)\n", + " return np.std(bin_avgs, ddof=1.5) / np.sqrt(n_bins)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden", + "solution2_first": true + }, + "source": [ + "#### Exercise\n", + "Now take the data in ```time_series_1``` and perform binning analysis for bin sizes from 3 up to 5000 and plot the estimated SEMs against the bin size with logarithmic x axis. Your SEM estimates should be stored in a numpy array called ```sems```." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden" + }, + "source": [ + "```python\n", + "sizes = np.arange(3, 5001, dtype=int)\n", + "sems = np.zeros(5001 - 3, dtype=float)\n", + "for s in range(len(sizes)):\n", + " sems[s] = do_binning_analysis(time_series_1, sizes[s])\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(sizes, sems, \"x\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"$N_B$\")\n", + "plt.ylabel(\"SEM\")\n", + "plt.show()\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should see that the series converges to a value between 0.04 and 0.05, before transitioning into a noisy tail. The tail becomes increasingly noisy, because as the block size increases, the number of blocks decreases, thus resulting in worse statistics.\n", + "\n", + "To extract the correct SEM from this plot, we can fit an exponential function to the first part of the data, that doesn't suffer from too much noise." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from scipy.optimize import curve_fit\n", + "\n", + "# only fit to the first couple of SEMs\n", + "CUTOFF = 600\n", + "\n", + "# sizes of the corresponding bins\n", + "sizes_subset = np.arange(3, 3 + CUTOFF, dtype=int)\n", + "\n", + "def fit_fn(x, a, b, c):\n", + " return -np.exp(-a * x) * b + c\n", + "\n", + "fit_params, _ = curve_fit(fit_fn, sizes_subset, sems[:CUTOFF], (0.05, 1, 0.5))\n", + "\n", + "fit_sems = fit_fn(sizes, *fit_params)\n", + "\n", + "# compute analytical solutions for AR(1) process\n", + "AN_SIGMA_1 = np.sqrt(EPS_1 ** 2 / (1 - PHI_1 ** 2))\n", + "AN_TAU_EXP_1 = -1 / np.log(PHI_1)\n", + "AN_SEM_1 = np.sqrt(2 * AN_SIGMA_1 ** 2 * AN_TAU_EXP_1 / N_SAMPLES)\n", + "\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(sizes, sems, \"x\", label=\"binning analysis\")\n", + "plt.plot(sizes[(0, -1),], np.repeat(AN_SEM_1, 2), \"-.\", label=\"analytical solution\")\n", + "plt.plot(sizes, fit_sems, \"-\", label=\"fit\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"$N_B$\")\n", + "plt.ylabel(\"SEM\")\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "print(f\"Final Standard Error of the Mean: {fit_params[2]:.4f}\")\n", + "print(f\"Analytical Standard Error of the Mean: {AN_SEM_1:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even though the fit is not perfect, it suffices to give us the position of the asymptote, which is the final estimate for the standard error of the mean. You can see that binning analysis, in fact, managed to estimate the SEM very precisely compared to the analytical solution. This illustrates that most of the time, binning analysis will give you a very reasonable estimate for the SEM, and in fact, is often used in practice because of its simplicity.\n", + "\n", + "However, in some cases, the statistics of your system can be quite challenging. Remember that in real applications, there won't be an analytical solution for the SEM. Therefore, you need to rely entirely on the statistical analysis. It is important to view the statistical analysis critically to decide whether the statistical analysis is trustworthy or not. To illustrate this, let's have a look at the binning analysis of the other time series that was generated at the start of the tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "sizes = np.arange(3, 5001, dtype=int)\n", + "sems = np.zeros(5001 - 3, dtype=float)\n", + "for s in range(len(sizes)):\n", + " sems[s] = do_binning_analysis(time_series_2, sizes[s])\n", + "\n", + "# compute analytical solutions for AR(1) process\n", + "AN_SIGMA_2 = np.sqrt(EPS_2 ** 2 / (1 - PHI_2 ** 2))\n", + "AN_TAU_EXP_2 = -1 / np.log(PHI_2)\n", + "AN_SEM_2 = np.sqrt(2 * AN_SIGMA_2 ** 2 * AN_TAU_EXP_2 / N_SAMPLES)\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.plot(sizes, sems, \"x\", label=\"binning analysis\")\n", + "plt.plot(sizes[(0, -1),], np.repeat(AN_SEM_2, 2), \"-.\", label=\"analytical solution\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"$N_B$\")\n", + "plt.ylabel(\"SEM\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Even though we have the exact same number of samples, we cannot see the binning analysis converge. The SEM simply cannot be determined. Usually, this is due to very long correlations, and can only be compensated by simulating for a longer time.\n", + "\n", + "You may notice that the binning analysis gets handwavey and uncertain when the statistics are bad. We could still – despite the noise – fit a function to the above data points and come up with a value for the SEM, but it will most certainly be quite inaccurate and cannot be trusted.\n", + "For such difficult cases, there is a more rigorous approach to do statistical analysis: *Auto-covariance analysis* can reveal whether or not your data has sufficient statistics to determine the SEM. It will be discussed in the second part of this tutorial." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References\n", + "[1] Wikipedia: Unbiased estimation of standard deviation" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/doc/tutorials/error_analysis/error_analysis_part2.ipynb b/doc/tutorials/error_analysis/error_analysis_part2.ipynb new file mode 100644 index 00000000000..2db9519be6b --- /dev/null +++ b/doc/tutorials/error_analysis/error_analysis_part2.ipynb @@ -0,0 +1,581 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial: Error Estimation - Part 2 (Autocorrelation Analysis)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Table of contents\n", + "0. [Data generation](#Data-generation)\n", + "1. [Introduction](#Introduction)\n", + "2. [Computing the auto-covariance function](#Computing-the-auto-covariance-function)\n", + "3. [Autocorrelation time](#Autocorrelation-time)\n", + "4. [References](#References)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data generation\n", + "\n", + "This first code cell will provide us with the same two data sets as in the previous part of this tutorial. We will use them to get familiar with the autocorrelation analysis method of error estimation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "plt.rcParams.update({'font.size': 18})\n", + "import sys\n", + "import logging\n", + "logging.basicConfig(level=logging.INFO, stream=sys.stdout)\n", + "\n", + "np.random.seed(43)\n", + "\n", + "def ar_1_process(n_samples, c, phi, eps):\n", + " '''\n", + " Generate a correlated random sequence with the AR(1) process.\n", + "\n", + " Parameters\n", + " ----------\n", + " n_samples: :obj:`int`\n", + " Sample size.\n", + " c: :obj:`float`\n", + " Constant term.\n", + " phi: :obj:`float`\n", + " Correlation magnitude.\n", + " eps: :obj:`float`\n", + " Shock magnitude.\n", + " '''\n", + " ys = np.zeros(n_samples)\n", + " if abs(phi) >= 1:\n", + " raise ValueError(\"abs(phi) must be smaller than 1.\")\n", + " # draw initial value from normal distribution with known mean and variance\n", + " ys[0] = np.random.normal(loc=c / (1 - phi), scale=np.sqrt(eps**2 / (1 - phi**2)))\n", + " for i in range(1, n_samples):\n", + " ys[i] = c + phi * ys[i - 1] + np.random.normal(loc=0., scale=eps)\n", + " return ys\n", + "\n", + "# generate simulation data using the AR(1) process\n", + "\n", + "logging.info(\"Generating data sets for the tutorial ...\")\n", + "\n", + "N_SAMPLES = 100000\n", + "\n", + "C_1 = 2.0\n", + "PHI_1 = 0.85\n", + "EPS_1 = 2.0\n", + "time_series_1 = ar_1_process(N_SAMPLES, C_1, PHI_1, EPS_1)\n", + "\n", + "C_2 = 0.05\n", + "PHI_2 = 0.999\n", + "EPS_2 = 1.0\n", + "time_series_2 = ar_1_process(N_SAMPLES, C_2, PHI_2, EPS_2)\n", + "\n", + "logging.info(\"Done\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(10, 6))\n", + "plt.title(\"The first 1000 samples of both time series\")\n", + "plt.plot(time_series_1[0:1000], label=\"time series 1\")\n", + "plt.plot(time_series_2[0:1000], label=\"time series 2\")\n", + "plt.xlabel(\"$i$\")\n", + "plt.ylabel(\"$X_i$\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "In the first part of the error analysis tutorial we have introduced the binning analysis, an easy and common tool for error estimation. However, we have seen that it failed to deliver an estimate for our second data set. In this tutorial, we will get to know a different method: the *autocorrelation analysis*, sometimes also called *auto covariance analysis*. It not only delivers an estimate for the standard error of the mean (SEM), but also information on the correlations and the optimal sampling rate.\n", + "\n", + "Before we start computing anything, we will give a brief overview over the relevant quantities and how they relate to each other. This outlines how one would go about computing these quantities. The end goal of this process is to define an estimator for the standard error of the mean $\\sigma_\\overline{X}$. And if the data allows for it, it can be calculated. If it fails, autocorrelation analysis provides more insight into the causes of the failure than the binning analysis from the first part of this tutorial. Albeit being more involved, it can provide a valuable tool for systems with difficult statistics.\n", + "\n", + "Let us begin the theory by defining the auto-covariance function $R^{XX}(\\tau)$ of an observable $X$, at lag time $\\tau$:\n", + "\n", + "$\n", + "\\begin{align}\n", + " R^{XX}(\\tau) &\\equiv \\langle (X(t)-\\langle X \\rangle)(X(t+\\tau)-\\langle X \\rangle) \\rangle \\\\\n", + " &= \\langle X(t) X(t+\\tau) \\rangle - \\langle X \\rangle^2, \\tag{1}\n", + "\\end{align}\n", + "$\n", + "\n", + "where $\\langle \\dots \\rangle$ denotes the ensemble average of the expression inside the angled brackets — e.g. $\\langle X \\rangle$ is the true mean value of the observable $X$. In the previous part we have established an understanding of correlations as being the \"similarity\" of successive samples. This is an intuitive but inaccurate understanding. The auto-covariance function provides a means to measure and quantify correlation.\n", + "\n", + "Computing the auto-covariance for $\\tau=0$ yields the variance $\\sigma=\\langle X^2 \\rangle - \\langle X \\rangle^2$. Normalizing the auto-covariance function by the variance yields the *autocorrelation function* (ACF)\n", + "\n", + "$\n", + "\\begin{align}\n", + " A^{XX}(\\tau) = \\frac{R^{XX}(\\tau)}{R^{XX}(0)} = \\frac{\\langle X(t) X(t+\\tau) \\rangle - \\langle X \\rangle^2}{\\langle X^2 \\rangle - \\langle X \\rangle^2}. \\tag{2}\n", + "\\end{align}\n", + "$\n", + "\n", + "The ACF can be used to estimate the correlation time $\\tau_X$. Often, this can be simply done by fitting an exponential function to $A^{XX}$, from which we extract $\\tau_{X, \\mathrm{exp}}$ as the inverse decay rate. However, the ACF doesn't necessarily assume the shape of an exponential. That is when another quantity, called the *integrated autocorrelation time*\n", + "\n", + "$\n", + "\\begin{align}\n", + " \\tau_{X, \\mathrm{int}} \\equiv \\int_0^\\infty A^{XX}(\\tau) \\mathrm{d}\\tau \\tag{3}\n", + "\\end{align}\n", + "$\n", + "\n", + "comes into play. Those two correlation times $\\tau_{X, \\mathrm{int}}$ and $\\tau_{X, \\mathrm{exp}}$ are identical for exponential ACFs, but if the ACF isn't exponential, $\\tau_{X, \\mathrm{int}}$ is the only meaningful quantity. It is related to the effective number of samples\n", + "\n", + "$\n", + "\\begin{align}\n", + " N_\\mathrm{eff} = \\frac{N}{2 \\tau_{X, \\mathrm{int}}} \\tag{4}\n", + "\\end{align}\n", + "$\n", + "\n", + "and also to the standard error of the mean (SEM)\n", + "\n", + "$\n", + "\\begin{align}\n", + " \\sigma_\\overline{X} = \\sqrt{\\frac{2 \\sigma_X^2 \\tau_{X, \\mathrm{int}}}{N}} = \\sqrt{\\frac{\\sigma_X^2}{N_\\mathrm{eff}}}. \\tag{5}\n", + "\\end{align}\n", + "$\n", + "\n", + "where $\\sigma_X^2 = \\langle X^2 \\rangle-\\langle X \\rangle ^2$ is the variance of the observable $X$. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Computing the auto-covariance function\n", + "\n", + "Equations (1) and (2) involve an infinite, continuous time series $X(t)$. In the simulation world however, we work with finite, discrete time series. These limitations dictate how we can estimate the true (unknown) autocorrelation function. For a finite, time-discrete set of samples $X_i$, a commonly used estimator is the following expression\n", + "\n", + "$\n", + "\\begin{align}\n", + " \\hat{R}^{XX}_j = \\frac{1}{N} \\sum^{N-|j|}_{i=1}(X_i-\\overline{X})(X_{i+|j|}-\\overline{X}), \\tag{6}\n", + "\\end{align}\n", + "$\n", + "\n", + "where $N$ is the total number of samples, and $\\overline{X}=\\frac{1}{N}\\sum_{i=1}^N X_i$ is the average of all samples. This estimates the auto-covariance function at lag time $\\tau=j\\Delta t$ where $\\Delta t$ is the time separation between samples.\n", + "\n", + "Before we continue, we want to notify the reader about a few subtleties regarding this estimator:\n", + "* Ideally, we would use $\\langle X \\rangle$ instead of $\\overline{X}$, since the latter is only an estimate of the former. In most cases we don't know $\\langle X \\rangle$, thus we introduce a small unknown bias by using the estimated mean $\\overline{X}$ instead.\n", + "* Actually, the sum does not contain $N$ terms, but $N-|j|$ terms. Consequently, we should divide the whole sum by $N-|j|$ and not by $N$. In fact, this approach yields a different estimator to the auto-covariance function (the so-called *unbiased* estimator). However, for large $N$ and small $j$, both estimators yield similar results. This is why the simpler $N$ is commonly used anyway." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden", + "solution2_first": true + }, + "source": [ + "#### Exercise\n", + "Compute the auto-covariance function of the data in `time_series_1` using the estimator in equation (6) and store it into a numpy array called `autocov`. Compute it for all $j$ from `0` up to `999`. Plot it against $j$." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden" + }, + "source": [ + "```python\n", + "# naive Python solution\n", + "autocov = np.zeros(300)\n", + "avg = np.average(time_series_1)\n", + "for j in range(300):\n", + " temp = 0.\n", + " for i in range(N_SAMPLES - j):\n", + " temp += (time_series_1[i] - avg) * (time_series_1[i + j] - avg)\n", + " autocov[j] = temp / N_SAMPLES\n", + "\n", + "fig = plt.figure(figsize=(10, 6))\n", + "plt.plot(autocov)\n", + "plt.xlabel(\"lag time $j$\")\n", + "plt.ylabel(\"$\\hat{R}^{XX}_j$\")\n", + "plt.show()\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Depending on your implementation, this computation might have taken a significant amount of time (up to a couple tens of seconds). When doing a lot of these computations, using highly optimized routines for numerics can save a lot of time. The following example shows how to utilize the common Numpy package to do the job quicker." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Numpy solution\n", + "time_series_1_centered = time_series_1 - np.average(time_series_1)\n", + "autocov = np.empty(1000)\n", + "\n", + "for j in range(1000):\n", + " autocov[j] = np.dot(time_series_1_centered[:N_SAMPLES - j], time_series_1_centered[j:])\n", + "autocov /= N_SAMPLES\n", + "\n", + "fig = plt.figure(figsize=(10, 6))\n", + "plt.gca().axhline(0, color=\"gray\", linewidth=1)\n", + "plt.plot(autocov)\n", + "plt.xlabel(\"lag time $j$\")\n", + "plt.ylabel(\"$\\hat{R}^{XX}_j$\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the auto-covariance function starts at a high value and decreases quickly into a long noisy tail which fluctuates around zero. The high values at short lag times indicate that there are strong correlations at short time scales, as expected. However, even though the tail looks uninteresting, it can bear important information about the statistics of your data. Small systematic deviations from 0 in the tail can be a hint that long-term correlations exist in your system. On the other hand, if there is no sign of a systematic deviation from 0 in the tail, this usually means that the correlation is decaying well within the simulation time, and that the statistics are good enough to estimate an error. In the above example, the correlation quickly decays to zero. Despite the noise in the tail, the statistics seem very reasonable." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Autocorrelation time\n", + "\n", + "Continuing our example, we can zoom into the first part of the auto-covariance function (using a log scale). We see that it indeed does have similarities with an exponential decay curve. In general, it isn't an exponential, but often can be approximated using one. If it matches reasonably well, the inverted prefactor in the exponential can be directly used as the *correlation time*, which is a measure for how many sampling intervals it takes for correlations to decay. Execute the following code cell for an illustration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from scipy.optimize import curve_fit\n", + "\n", + "def exp_fnc(x, a, b):\n", + " return a * np.exp(-x / b)\n", + "\n", + "N_MAX = 1000\n", + "j = np.arange(1, N_MAX)\n", + "j_log = np.logspace(0, 3, 100)\n", + "popt, pcov = curve_fit(exp_fnc, j, autocov[1:N_MAX], p0=[15, 10])\n", + "\n", + "# compute analytical ACF of AR(1) process\n", + "AN_SIGMA_1 = np.sqrt(EPS_1 ** 2 / (1 - PHI_1 ** 2))\n", + "AN_TAU_EXP_1 = -1 / np.log(PHI_1)\n", + "an_acf_1 = AN_SIGMA_1**2 * np.exp(-j / AN_TAU_EXP_1)\n", + "\n", + "fig = plt.figure(figsize=(10, 6))\n", + "plt.plot(j, autocov[1:N_MAX], \"x\", label=\"numerical ACF\")\n", + "plt.plot(j, an_acf_1, \"-.\", linewidth=3, label=\"analytical ACF\")\n", + "plt.plot(j_log, exp_fnc(j_log, popt[0], popt[1]), label=\"exponential fit\")\n", + "plt.xlim((1, N_MAX))\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(\"lag time $j$\")\n", + "plt.ylabel(\"$\\hat{R}^{XX}_j$\")\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "print(f\"Exponential autocorrelation time: {popt[1]:.2f} sampling intervals\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the auto-covariance function is very well matched with an exponential, this analysis already gives us a reasonable estimate of the autocorrelation time. Here we have the luxury to have an analytical ACF at hand which describes the statistics of the simple AR(1) process, which generated our simulation data. It is in fact exponential and agrees very well with the numerical ACF. In practice, however, you will neither know an analytical ACF, nor know if the ACF is exponential, at all. In many systems, the ACF is more or less exponential, but this is not necessarily the case.\n", + "\n", + "For the sake of completeness, we also want to compute the integrated correlation time. This technique must be applied when the ACF is not exponential. For that purpose, we first need to normalize the auto-covariance function in order to get the autocorrelation function (as opposed to auto-covariance function), and then integrate over it.\n", + "\n", + "The integration in equation (3) is again approximated as a discrete sum over the first $j_\\mathrm{max}$ values of the ACF (except $\\hat{A}^{XX}_0$, which is always 1):\n", + "\n", + "$\n", + "\\begin{align}\n", + " \\hat{\\tau}_{X, \\mathrm{int}} = \\frac{1}{2} + \\sum_{j=1}^{j_\\mathrm{max}} \\hat{A}^{XX}_j \\tag{7}\n", + "\\end{align}\n", + "$\n", + "\n", + "where $\\hat{A}^{XX}_j = \\hat{R}^{XX}_j / \\hat{R}^{XX}_0$ is the estimated ACF. The sum is evaluated up to a maximum number of terms $j_\\mathrm{max}$. This maximum number of terms is a crucial parameter. In the following code cell, $\\hat{\\tau}_{X, \\mathrm{int}}$ is plotted over $j_\\mathrm{max}$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# compute the ACF\n", + "acf = autocov / autocov[0]\n", + "\n", + "# integrate the ACF (suffix _v for vectors)\n", + "j_max_v = np.arange(1000)\n", + "tau_int_v = np.zeros(1000)\n", + "for j_max in j_max_v:\n", + " tau_int_v[j_max] = 0.5 + np.sum(acf[1:j_max + 1])\n", + "\n", + "# plot\n", + "fig = plt.figure(figsize=(10, 6))\n", + "plt.plot(j_max_v[1:], tau_int_v[1:], label=\"numerical summing\")\n", + "plt.plot(j_max_v[(1, -1),], np.repeat(AN_TAU_EXP_1, 2), \"-.\", label=\"analytical\")\n", + "plt.xscale(\"log\")\n", + "plt.xlabel(r\"sum length $j_\\mathrm{max}$\")\n", + "plt.ylabel(r\"$\\hat{\\tau}_{X, \\mathrm{int}}$\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this plot, we have the analytical solution at hand, which is a luxury not present in real applications. For the analysis, we therefore need to act as if there was no analytic solution:\n", + "\n", + "We see that the integrated autocorrelation time seems to quickly reach a plateau at a $j_\\mathrm{max}$ of around 20. Further summation over the noisy tail of the ACF results in a random-walky behaviour. And for even larger $j_\\mathrm{max}$, the small unknown bias of the ACF starts to accumulate, which is clearly unwanted. Thus, we have to find a good point to cut off the sum. There are several ways to determine a reasonable value for $j_\\mathrm{max}$. Here, we demonstrate the one by A. Sokal [1], who states that it performs well if there are at least 1000 samples in the time series. We take the smallest $j_\\mathrm{max}$, for which the following inequality holds:\n", + "\n", + "$\n", + "j_\\mathrm{max} \\geq C \\times \\hat{\\tau}_{X, \\mathrm{int}}(j_\\mathrm{max}) \\tag{8}\n", + "$\n", + "\n", + "where $C$ is a constant of about 5, or higher if convergence of $\\hat{\\tau}_{X, \\mathrm{int}}$ is slower than an exponential (up to $C=10$). In the following code cell, we plot the left side against the right side, and determine $j_\\mathrm{max}$." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "C = 5.0\n", + "\n", + "# determine j_max\n", + "j_max = 0\n", + "while j_max < C * tau_int_v[j_max]:\n", + " j_max += 1\n", + "\n", + "\n", + "# plot\n", + "fig = plt.figure(figsize=(10, 6))\n", + "plt.plot(j_max_v[1:], C * tau_int_v[1:])\n", + "plt.plot(j_max_v[1:], j_max_v[1:])\n", + "plt.plot([j_max], [C * tau_int_v[j_max]], \"ro\")\n", + "plt.xscale(\"log\")\n", + "plt.ylim((0, 50))\n", + "plt.xlabel(r\"sum length $j_\\mathrm{max}$\")\n", + "plt.ylabel(r\"$C \\times \\hat{\\tau}_{X, \\mathrm{int}}$\")\n", + "plt.show()\n", + "\n", + "print(f\"j_max = {j_max}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using this value of $j_\\mathrm{max}$, we can calculate the integrated autocorrelation time $\\hat{\\tau}_{X, \\mathrm{int}}$ and estimate the SEM with equation (5)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tau_int = tau_int_v[j_max]\n", + "print(f\"Integrated autocorrelation time: {tau_int:.2f} time steps\\n\")\n", + "\n", + "N_eff = N_SAMPLES / (2 * tau_int)\n", + "print(f\"Original number of samples: {N_SAMPLES}\")\n", + "print(f\"Effective number of samples: {N_eff:.1f}\")\n", + "print(f\"Ratio: {N_eff / N_SAMPLES:.3f}\\n\")\n", + "\n", + "sem = np.sqrt(autocov[0] / N_eff)\n", + "print(f\"Standard error of the mean: {sem:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden", + "solution2_first": true + }, + "source": [ + "#### Exercise\n", + "* Write a function called `autocorrelation_analysis`, which takes as arguments\n", + " * `data` (a numpy array containing the time series to be analyzed), \n", + " * `C` (which is the criterion to find $j_\\mathrm{max}$) and \n", + " * `window` (an integer that defines how much of the auto-covariance function is computed during the analysis).\n", + " \n", + " The function shall return the SEM and logging.info out:\n", + " * mean\n", + " * SEM\n", + " * integrated autocorrelation time\n", + " * effective number of samples. \n", + " \n", + " It should also make a plot of the autocorrelation function and the integrated ACF. You can adapt the other examples and solutions in this notebook for this function.\n", + " \n", + "* Use this function to analyze `time_series_2`.\n", + "\n", + "This function can serve as a template for the analysis of your own simulation data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden" + }, + "source": [ + "```python\n", + "def autocorrelation_analysis(data, C, window):\n", + " # initial processing\n", + " data_size = len(data)\n", + " avg = np.average(data)\n", + " data_centered = data - avg\n", + "\n", + " # auto-covariance function\n", + " autocov = np.empty(window)\n", + " for j in range(window):\n", + " autocov[j] = np.dot(data_centered[:data_size - j], data_centered[j:])\n", + " autocov /= data_size\n", + "\n", + " # autocorrelation function\n", + " acf = autocov / autocov[0]\n", + "\n", + " # integrate autocorrelation function\n", + " j_max_v = np.arange(window)\n", + " tau_int_v = np.zeros(window)\n", + " for j_max in j_max_v:\n", + " tau_int_v[j_max] = 0.5 + np.sum(acf[1:j_max + 1])\n", + "\n", + " # find j_max\n", + " j_max = 0\n", + " while j_max < C * tau_int_v[j_max]:\n", + " j_max += 1\n", + "\n", + " # wrap it up\n", + " tau_int = tau_int_v[j_max]\n", + " N_eff = data_size / (2 * tau_int)\n", + " sem = np.sqrt(autocov[0] / N_eff)\n", + "\n", + " # create ACF plot\n", + " fig = plt.figure(figsize=(10, 6))\n", + " plt.gca().axhline(0, color=\"gray\",linewidth=1)\n", + " plt.plot(acf)\n", + " plt.xlabel(\"lag time $j$\")\n", + " plt.ylabel(\"$\\hat{R}^{XX}_j$\")\n", + " plt.show()\n", + "\n", + " # create integrated ACF plot\n", + " fig = plt.figure(figsize=(10, 6))\n", + " plt.plot(j_max_v[1:], C * tau_int_v[1:])\n", + " plt.ylim(plt.gca().get_ylim()) # explicitly keep the limits of the first plot\n", + " plt.plot(j_max_v[1:], j_max_v[1:])\n", + " plt.plot([j_max], [C * tau_int_v[j_max]], \"ro\")\n", + " plt.xscale(\"log\")\n", + " plt.xlabel(r\"sum length $j_\\mathrm{max}$\")\n", + " plt.ylabel(r\"$C \\times \\hat{\\tau}_{X, \\mathrm{int}}$\")\n", + " plt.title(\"\")\n", + " plt.show()\n", + "\n", + " # print out stuff\n", + " print(f\"Mean value: {avg:.4f}\")\n", + " print(f\"Standard error of the mean: {sem:.4f}\")\n", + " print(f\"Integrated autocorrelation time: {tau_int:.2f} time steps\")\n", + " print(f\"Effective number of samples: {N_eff:.1f}\")\n", + "\n", + " return sem\n", + "\n", + "\n", + "sem_2 = autocorrelation_analysis(time_series_2, 5, 20000)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden", + "solution2_first": true + }, + "source": [ + "#### Exercise\n", + "Interpret the results of the analysis of `time_series_2`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "solution2": "hidden" + }, + "source": [ + "**Interpretation of the analysis**\n", + "\n", + "Even though the autocorrelation analysis spits out a number for the SEM, it cannot be trusted. The ACF has a lot of noise in its tail which lets the integrated ACF become very \"random walky\" and therefore unreliable. This means that the ACF has not properly decayed to zero. The only possibility to get better statistics is to simulate for a longer time. Since the autocorrelation time is very long, it is sufficient to store a lot less samples during simulation. The sampling interval could be chosen to be 100 times larger and still capture the statistics sufficiently well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References\n", + "[1] A. Sokal. Monte Carlo Methods in Statistical Mechanics: Foundations and New Algorithms Note to the Reader , 1996" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/testsuite/scripts/tutorials/CMakeLists.txt b/testsuite/scripts/tutorials/CMakeLists.txt index 3410c839c94..cd37fb7bd77 100644 --- a/testsuite/scripts/tutorials/CMakeLists.txt +++ b/testsuite/scripts/tutorials/CMakeLists.txt @@ -25,6 +25,8 @@ add_custom_target( COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_BINARY_DIR}/doc/tutorials ${TUTORIALS_DIR} DEPENDS tutorials_python) +tutorial_test(FILE test_error_analysis_part1.py) +tutorial_test(FILE test_error_analysis_part2.py) tutorial_test(FILE test_lennard_jones.py) tutorial_test(FILE test_charged_system.py) tutorial_test(FILE test_lattice_boltzmann_part2.py) diff --git a/testsuite/scripts/tutorials/test_error_analysis_part1.py b/testsuite/scripts/tutorials/test_error_analysis_part1.py new file mode 100644 index 00000000000..648ef0fc126 --- /dev/null +++ b/testsuite/scripts/tutorials/test_error_analysis_part1.py @@ -0,0 +1,109 @@ +# Copyright (C) 2021 The ESPResSo project +# +# This file is part of ESPResSo. +# +# ESPResSo is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ESPResSo is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest as ut +import importlib_wrapper +import numpy as np +import scipy.signal + +tutorial, skipIfMissingFeatures = importlib_wrapper.configure_and_import( + filepath="@TUTORIALS_DIR@/error_analysis/error_analysis_part1.py") + + +@skipIfMissingFeatures +class Tutorial(ut.TestCase): + + def ar_1_process(self, n, c, phi, eps): + y0 = np.random.normal(loc=c / (1 - phi), + scale=np.sqrt(eps**2 / (1 - phi**2))) + y = c + np.random.normal(loc=0.0, scale=eps, size=n - 1) + y = np.insert(y, 0, y0) + # get an AR(1) process from an ARMA(p,q) process with p=1 and q=0 + y = scipy.signal.lfilter([1.], [1., -phi], y) + return y + + def test_ar1_implementation(self): + with self.assertRaises(ValueError): + tutorial.ar_1_process(10, 1.0, 1.1, 3.0) + with self.assertRaises(ValueError): + tutorial.ar_1_process(10, 1.0, -1.1, 3.0) + + for seed in range(5): + for eps in [0.5, 1., 2.]: + for phi in [0.1, 0.8, 0.999, -0.3]: + c = eps / 2. + np.random.seed(seed) + seq = tutorial.ar_1_process(10, c, phi, eps) + np.random.seed(seed) + ref = self.ar_1_process(10, c, phi, eps) + np.testing.assert_allclose(seq, ref, atol=1e-12, rtol=0) + + def test(self): + self.assertLess(abs(tutorial.PHI_1), 1.0) + self.assertLess(abs(tutorial.PHI_2), 1.0) + + # Test manual binning analysis + ref_bin_avgs = np.mean( + tutorial.time_series_1[:tutorial.N_BINS * tutorial.BIN_SIZE].reshape((tutorial.N_BINS, -1)), axis=1) + np.testing.assert_allclose( + tutorial.bin_avgs, + ref_bin_avgs, + atol=1e-12, + rtol=0) + self.assertAlmostEqual( + tutorial.avg, + np.mean(ref_bin_avgs), + delta=1e-10) + self.assertAlmostEqual( + tutorial.sem, + np.std(ref_bin_avgs, ddof=1.5) / np.sqrt(tutorial.N_BINS), + delta=1e-10) + + # Test binning analysis function + for bin_size in [2, 10, 76, 100]: + data = np.random.random(500) + n_bins = 500 // bin_size + sem = tutorial.do_binning_analysis(data, bin_size) + ref_bin_avgs = np.mean( + data[:n_bins * bin_size].reshape((n_bins, -1)), axis=1) + ref_sem = np.std(ref_bin_avgs, ddof=1.5) / np.sqrt(n_bins) + self.assertAlmostEqual(sem, ref_sem, delta=1e-10) + + # The analytic expressions for the AR(1) process are taken from + # https://en.wikipedia.org/wiki/Autoregressive_model#Example:_An_AR(1)_process + # (accessed June 2021) + SIGMA_1 = np.sqrt(tutorial.EPS_1 ** 2 / (1 - tutorial.PHI_1 ** 2)) + TAU_EXP_1 = -1 / np.log(tutorial.PHI_1) + # The autocorrelation is exponential, thus tau_exp = tau_int, and + # therefore + SEM_1 = np.sqrt(2 * SIGMA_1 ** 2 * TAU_EXP_1 / tutorial.N_SAMPLES) + + self.assertAlmostEqual( + tutorial.fit_params[2], + SEM_1, + delta=0.1 * SEM_1) + self.assertAlmostEqual(tutorial.AN_SEM_1, SEM_1, delta=1e-10 * SEM_1) + + SIGMA_2 = np.sqrt(tutorial.EPS_2 ** 2 / (1 - tutorial.PHI_2 ** 2)) + TAU_EXP_2 = -1 / np.log(tutorial.PHI_2) + SEM_2 = np.sqrt(2 * SIGMA_2 ** 2 * TAU_EXP_2 / tutorial.N_SAMPLES) + + self.assertAlmostEqual(tutorial.AN_SEM_2, SEM_2, delta=1e-10 * SEM_2) + + +if __name__ == "__main__": + ut.main() diff --git a/testsuite/scripts/tutorials/test_error_analysis_part2.py b/testsuite/scripts/tutorials/test_error_analysis_part2.py new file mode 100644 index 00000000000..18d764ad872 --- /dev/null +++ b/testsuite/scripts/tutorials/test_error_analysis_part2.py @@ -0,0 +1,90 @@ +# Copyright (C) 2021 The ESPResSo project +# +# This file is part of ESPResSo. +# +# ESPResSo is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# ESPResSo is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import unittest as ut +import importlib_wrapper +import numpy as np +import scipy.signal + +tutorial, skipIfMissingFeatures = importlib_wrapper.configure_and_import( + filepath="@TUTORIALS_DIR@/error_analysis/error_analysis_part2.py") + + +@skipIfMissingFeatures +class Tutorial(ut.TestCase): + + def ar_1_process(self, n, c, phi, eps): + y0 = np.random.normal(loc=c / (1 - phi), + scale=np.sqrt(eps**2 / (1 - phi**2))) + y = c + np.random.normal(loc=0.0, scale=eps, size=n - 1) + y = np.insert(y, 0, y0) + # get an AR(1) process from an ARMA(p,q) process with p=1 and q=0 + y = scipy.signal.lfilter([1.], [1., -phi], y) + return y + + def test_ar1_implementation(self): + with self.assertRaises(ValueError): + tutorial.ar_1_process(10, 1.0, 1.1, 3.0) + with self.assertRaises(ValueError): + tutorial.ar_1_process(10, 1.0, -1.1, 3.0) + + for seed in range(5): + for eps in [0.5, 1., 2.]: + for phi in [0.1, 0.8, 0.999, -0.3]: + c = eps / 2. + np.random.seed(seed) + seq = tutorial.ar_1_process(10, c, phi, eps) + np.random.seed(seed) + ref = self.ar_1_process(10, c, phi, eps) + np.testing.assert_allclose(seq, ref, atol=1e-12, rtol=0) + + def test(self): + self.assertLess(abs(tutorial.PHI_1), 1.0) + self.assertLess(abs(tutorial.PHI_2), 1.0) + + # The analytic expressions for the AR(1) process are taken from + # https://en.wikipedia.org/wiki/Autoregressive_model#Example:_An_AR(1)_process + # (accessed June 2021) + SIGMA_1 = np.sqrt(tutorial.EPS_1 ** 2 / (1 - tutorial.PHI_1 ** 2)) + TAU_EXP_1 = -1 / np.log(tutorial.PHI_1) + ref_acf_1 = SIGMA_1**2 * \ + np.exp(-np.arange(1, tutorial.N_MAX, dtype=float) / TAU_EXP_1) + np.testing.assert_allclose(tutorial.an_acf_1, ref_acf_1) + # The autocorrelation is exponential, thus tau_exp = tau_int, and + # therefore + N_EFF_1 = tutorial.N_SAMPLES / (2 * TAU_EXP_1) + SEM_1 = np.sqrt(SIGMA_1 ** 2 / N_EFF_1) + + self.assertAlmostEqual(tutorial.sem, SEM_1, delta=0.1 * SEM_1) + self.assertAlmostEqual(tutorial.N_eff, N_EFF_1, delta=0.1 * N_EFF_1) + # for some reason, the integrated autocorrelation time is always higher + # than the exponential one, in the tutorial + self.assertAlmostEqual( + tutorial.tau_int, + TAU_EXP_1, + delta=0.1 * TAU_EXP_1) + + SIGMA_2 = np.sqrt(tutorial.EPS_2 ** 2 / (1 - tutorial.PHI_2 ** 2)) + TAU_EXP_2 = -1 / np.log(tutorial.PHI_2) + SEM_2 = np.sqrt(2 * SIGMA_2 ** 2 * TAU_EXP_2 / tutorial.N_SAMPLES) + # the point of the following value in the tutorial is that it is very + # inaccurate, thus the high tolerance + self.assertAlmostEqual(tutorial.sem_2, SEM_2, delta=0.2 * SEM_2) + + +if __name__ == "__main__": + ut.main()