% LaTeX template, xeCJK usepackage and Unicode text need XeLaTeX to compile.
% MiKTeX package can be downloaded at miktex.org, WinEdt can be downloaded at http://www.winedt.com/.
% WinEdt 6 and higher provide XeLaTeX and Unicode support.
% LaTeX template version 1.0.2, last revised on 2014-11-04.
\documentclass[10pt]{article}
% *************************** installed packages *************************
% AMS packages
\usepackage{amsfonts} % TeX fonts from the American Mathematical Society.
\usepackage{amsmath} % AMS mathematical facilities for LaTeX.
\usepackage{amssymb} % AMS symbols
\usepackage{amsthm} % Provide proclamations environment.
% graphics packages
\usepackage{graphicx} % Enhanced LaTeX graphics (graphicx supersedes the older graphics package).
\usepackage{tikz} % TikZ and PGF package for graphics
\usetikzlibrary{matrix} % matrix library of TikZ package
\usetikzlibrary{trees} % trees library of TikZ package
% support for foreign languages, esp. Chinese.
\usepackage{xeCJK} % Support for CJK (Chinese, Japanese, Korean) documents in XeLaTeX.
\setCJKmainfont{SimSun} % MUST appear when xeCJK is loaded.
%\setCJKmainfont{DFKai-SB} % 设置正文罗马族的CJK字体，影响 \rmfamily 和 \textrm 的字体。此处设为“标楷体”。
%\setCJKmainfont{SimSun} % 设置正文罗马族的CJK字体，影响 \rmfamily 和 \textrm 的字体。此处设为“宋体”。
%\setCJKmonofont{MingLiU} % 设置正文等宽族的CJK字体，影响 \ttfamily 和 \texttt 的字体。此处设为“细明体”。
%\renewcommand\abstractname{摘要} % 重定义摘要名：abstract->摘要。
%\renewcommand\appendixname{附录} % 重定义附录名：appendix->附录。
%\renewcommand\bibname{参考文献} % 重定义参考文献名：bibliography->参考文献。
%\renewcommand\contentsname{目录} % 重定义目录名：contents->目录。
%\renewcommand\refname{参考文献} % 重定义参考文献名：references->参考文献。
% miscellaneous packages
\usepackage[toc, page]{appendix} % Extra control of appendices.
\usepackage{clrscode} % Typesets pseudocode as in Introduction to Algorithms.
%\usepackage{courier} % Typesets program code.
\usepackage{epsfig}
\usepackage{eurosym} % Metafont and macros for Euro sign.
\usepackage{float} % Improved interface for floating objects.
\usepackage{fontspec} % Advanced font selection in XeLaTeX and LuaLaTeX.
\usepackage{indentfirst}
\usepackage{xcolor} % Driver-independent color extensions for LaTeX and pdfLaTeX.
% must-be-the-last packages
\usepackage[pagebackref]{hyperref} % Extensive support for hypertext in LaTeX; MUST be on the last \usepackage line in the preamble. [pagebackref] for page referencing; [backref] for section referencing.
% ********************** end of installed packages ***********************
% ************************** fullpage.sty ********************************
% This is FULLPAGE.STY by H.Partl, Version 2 as of 15 Dec 1988.
% Document Style Option to fill the paper just like Plain TeX.
\typeout{Style Option FULLPAGE Version 2 as of 15 Dec 1988}
\topmargin 0pt \advance \topmargin by -\headheight \advance
\topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt \evensidemargin \oddsidemargin \marginparwidth
0.5in
\textwidth 6.5in
% For users of A4 paper: The above values are suited for American 8.5x11in
% paper. If your output driver performs a conversion for A4 paper, keep
% those values. If your output driver conforms to the TeX standard (1in/1in),
% then you should add the following commands to center the text on A4 paper:
% \advance\hoffset by -3mm % A4 is narrower.
% \advance\voffset by 8mm % A4 is taller.
% ************************ end of fullpage.sty ***************************
% ************** Proclamations (theorem-like structures) *****************
% [section] option provides numbering within a section.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}[section]
\newtheorem{definition}{Definition}[section]
\newtheorem{prop}{Proposition}[section]
\newtheorem{corollary}{Corollary}[section]
\newtheorem{remark}{Remark}[section]
\newtheorem{example}{Example}
% ************************************************************************
% ************* Solutions use a modified proof environment ***************
\newenvironment{solution}
{\begin{proof}[Solution]}
{\end{proof}}
% ************************************************************************
% ************* Frequently used commands as shorthand ********************
\newcommand{\norm}{|\!|}
% ************************************************************************
\begin{document}
\title{\Huge Book Summary: \textit{Econometrics for Dummies}}
\author{Yan Zeng}
\date{Version 1.0.3, last revised on 2016-08-04.}
\maketitle
\begin{abstract}
Summary of Pedace \cite{Pedace13a} and \cite{Pedace13b}.
\end{abstract}
\tableofcontents
\newpage
\part{Getting Started with Econometrics}
\section{Econometrics: The Economist's Approach to Statistical Analysis}
\section{Getting the Hang of Probability}
\section{Making Inferences and Testing Hypotheses}
{\bf Applicability of the Central Limit Theorem}.
$\bullet$ When the probability distribution of $X$ is normal, the distribution of $\overline{X}$ is exactly normally distributed regardless of sample size.
$\bullet$ When the probability distribution of $X$ is symmetrical, the CLT applies very well to small sample sizes (often as small as $10 \le n \le 25$).
$\bullet$ When the distribution of $X$ is asymmetrical, the approximation to a normal distribution becomes more accurate as $n$ becomes large.
Generally, a good convergence of the sample mean distribution to a normal distribution can be achieved with a sample size of 25 or more.
\medskip
{\bf The chi-squared distribution}. The chi-squared distribution is typically used with {\it variance} estimates and rests on the idea that you begin with a normally distributed random variable, such as $X \sim N(\mu_X, \sigma_X^2)$. With sample data, you estimate the variance of this random variable with
\[
s_X^2 = \frac{\sum_{i=1}^n (X_i - \overline{X})^2}{n-1}.
\]
The chi-squared distribution is obtained by
\[
\frac{(n-1)s_X^2}{\sigma_X^2} = \frac{\sum_{i=1}^n (X_i - \overline{X})^2}{\sigma_X^2} \sim \chi_{n-1}^2.
\]
The chi-squared distribution takes only nonnegative values and tends to be right-skewed. The extent of its skewness depends on the degrees of freedom or number of observations. The higher the degrees of freedom (more observations), the less skewed (more symmetrical) the chi-squared distribution.
\medskip
{\bf The $t$-distribution}. The $t$-distribution is derived from a ratio of a standard normal random variable and the square root of a $\chi^2$ random variable divided by its degrees of freedom. It's bell-shaped, symmetric around zero, and approaches a normal distribution as the degrees of freedom (number of observations) increase. When you take the ratio of the standard normal to the square root of your chi-squared random variable (divided by its degrees of freedom), you end up with a $t$-distribution:
\[
\frac{(\overline{X}-\mu_X)/\frac{\sigma_X}{\sqrt{n}}}{\sqrt{\frac{s_X^2}{\sigma_X^2}}} = \frac{\overline{X}-\mu_X}{\frac{\sigma_X}{\sqrt{n}}} \cdot \frac{\sigma_X}{s_X} = \frac{\overline{X}-\mu_X}{\frac{s_X}{\sqrt{n}}} \sim t_{n-1}.
\]
\medskip
{\bf The $F$-distribution}. The $F$-distribution is used to compare variances of two different normal distributions. It is derived from a ratio of two $\chi^2$ distributions divided by their respective degrees of freedom. The $F$-distribution tends to be right-skewed, with the amount of skewness depending on the degrees of freedom. As the degrees of freedom in the numerator and denominator increase, the $F$-distribution approaches a normal distribution.
\[
\frac{\sum_{i=1}^n (X_i-\overline{X})^2/(n-1)}{\sum_{i=1}^m (Y_i-\overline{Y})^2/(m-1)} = \frac{s_X^2}{s_Y^2} \sim F_{(n-1), (m-1)}
\]
\part{Building the Classical Linear Regression Model}
\section{Understanding the Objectives of Regression Analysis}
{\it Model specification} consists of selecting an outcome of interest or dependent variable and one or more independent factors, as well as choosing an appropriate functional form.
{\it Spurious correlation} occurs when two variables coincidentally have a statistical relationship but one doesn't cause the other.
Causation cannot be proven by statistical results. Your results can be used to support a hypothesis of causality, but only after you've developed a model that is well grounded in economic theory and/or good common sense.
\medskip
In regression analysis, the random error term represents the difference between the observed value of your dependent variable and the conditional mean of the dependent variable derived from your model:
\[
\varepsilon = Y - E[Y | X_1, \cdots, X_n] = Y - E[Y|{\bf X}].
\]
The random error can result from one or more of the following factors:
\checkmark Insufficient or incorrectly measured data.
\checkmark A lack of theoretical insights to fully account for all the factors that affect the dependent variable.
\checkmark Applying an incorrect functional form; for example, assuming the relationship is linear when it's quadratic.
\checkmark Unobservable characteristics.
\checkmark Unpredictable elements of behavior.
\medskip
The equation $\varepsilon = Y - E[Y | {\bf X}]$ can be written more explicitly as
\[
Y({\bf x}; \omega) = E[Y|{\bf X} = {\bf x}] + \varepsilon({\bf x}; \omega), \quad \text{or} \quad \varepsilon({\bf x}; \omega) = Y({\bf x}; \omega) - E[Y|{\bf X} = {\bf x}]
\]
where $Y({\bf x}; \omega)$ is sampled according to the conditional distribution of $Y$ at ${\bf X}={\bf x}$. We typically assume $(\varepsilon({\bf x}; \cdot))_{{\bf x}}$ has identical variance or is even i.i.d. Note this is truly a strong assumption.
\medskip
{\it Cross-sectional data} contains measurements for individual observations at a given point in time.
\[
Y_i = \beta_0 + \sum_{k=1}^p \beta_k X_{ik} + \varepsilon_i.
\]
{\it Time-series data} contains measurements on one or more variables over time in a given space.
\[
Y_t = \beta_0 + \sum_{k=1}^p \beta_k X_{tk} + \varepsilon_t.
\]
{\it Panel data} (also referred to as {\it longitudinal data}) contains a time series for each cross-sectional unit in the sample.
\[
Y_{it} = \beta_0 + \sum_{k=1}^p \beta_k X_{itk} + \varepsilon_{it}.
\]
{\it Pooled cross-sectional data}. Simply because your dataset contains both a cross-sectional and time-series component doesn't make it a panel dataset. It isn't a panel dataset unless the same individual units are observed in each subsequent time period.
\section{Going Beyond Ordinary with the Ordinary Least Squares Technique}
{\bf Regression coefficients in a model with one independent variable:}
\[
\hat \beta_1 = \frac{\sum_{i=1}^n (Y_i - \overline{Y})(X_i-\overline{X})}{\sum_{i=1}^n(X_i-\overline{X})^2} = \frac{\hat{s}_{XY}}{\hat{s}^2_{X}}, \; \hat \beta_0 = \overline{Y} - \hat \beta_1 \overline{X}.
\]
Intercept term is usually ignored in applied work, because situations where all of the explanatory variables equal zero are unlikely to occur.
\medskip
{\bf Justifying the least squares principle}. In most situations, OLS remains the most popular technique for estimating regressions for the following three reasons:
$\bullet$ Using OLS is easier than the alternatives. Other techniques require more mathematical sophistication and more computing power.
$\bullet$ OLS is sensible. You can avoid positive and negative residuals canceling each other out and find a regression line that's as close as possible to the observed data points.
$\bullet$ OLS results have desirable characteristics.
\hspace{0.2in} \checkmark The regression line always passes through the sample means of $Y$ and $X$, or $\overline{Y} = \hat \beta_0 + \hat \beta_1 \overline{X}$ (the point $(\overline{X}, \overline{Y})$ falls on the line $y = \hat \beta_0 + \hat \beta_1 x$): by the definition of $\hat \beta_0$ and $\hat \beta_1$.
\hspace{0.2in} \checkmark The mean of the estimated (predicted) $Y$ value is equal to the mean value of the actual $Y$, or $\overline{\hat Y} = \overline{\hat \beta_0 + \hat \beta_1 X} = \hat \beta_0 + \hat \beta_1 \overline{X} = \overline{Y}$.
\hspace{0.2in} \checkmark The mean of the residuals is zero, or $\overline{\hat \varepsilon} = \overline{Y - (\hat \beta_0 + \hat \beta_1 X)}=\overline{Y}-(\hat \beta_0 + \hat \beta_1\overline{X}) = 0$.
\hspace{0.2in} \checkmark The residuals are uncorrelated with the predicted $Y$, or $\sum_{i=1}^n (\hat Y_i - \overline{Y}) \hat \varepsilon_i = 0$.
\hspace{0.2in} \checkmark The residuals are uncorrelated with observed values of the independent variable, or $\sum_{i=1}^n \hat \varepsilon_i X_i = 0$.
\medskip
{\bf Standardizing regression coefficients}. Comparing coefficient values is not as straightforward as you may first think. Here are a few reasons why:
$\bullet$ In standard OLS regression, the coefficient with the largest magnitude is not necessarily associated with ``the most important'' variable.
$\bullet$ Coefficient magnitudes can be affected by changing the units of measurement; in other words, scale matters.
$\bullet$ Even variables measured on similar scales can have different amounts of variability.
\smallskip
If you want to compare coefficient magnitudes in a multiple regression, you need to calculate the {\it standardized regression coefficients}. You can do so in two ways:
$\bullet$ Calculating a $Z$-score for every variable of every observation and then performing OLS with the $Z$ values rather than the raw data.
$\bullet$ Obtaining the OLS regression coefficients using the raw data and then multiplying each coefficient by $\left(\frac{\hat\sigma_{X_k}}{\hat\sigma_Y}\right)$.
Mathematically, you transform the original regression equation $Y_i = \beta_0 + \beta_1 X_{i1} + \beta_2 X_{i2} + \cdots + \beta_p X_{ip} + \varepsilon_i$ to
\[
\frac{Y_i - \overline{Y}}{\hat\sigma_Y} = \beta_1 \left(\frac{X_{i1} - \overline{X_1}}{\hat \sigma_{X_1}} \right) \left(\frac{\hat\sigma_{X_1}}{\hat\sigma_Y}\right) + \beta_2 \left(\frac{X_{i2} - \overline{X_2}}{\hat \sigma_{X_2}} \right) \left(\frac{\hat\sigma_{X_2}}{\hat\sigma_Y}\right) + \cdots + \beta_p \left(\frac{X_{ip} - \overline{X_p}}{\hat \sigma_{X_p}} \right) \left(\frac{\hat\sigma_{X_p}}{\hat\sigma_Y}\right) + \frac{\hat \varepsilon_i}{\hat \sigma_Y},
\]
where we have taken advantage of one of the desirable OLS properties, namely that the average residual is zero.
Note regular OLS coefficients and standardized regression coefficients do not have the same meaning. The standardized regression coefficient estimates the standard deviation change in your dependent variable for a $1$-standard-deviation change in the independent variable, holding other variables constant.
\medskip
{\bf Measuring goodness of fit}.
$\bullet$ {\it Explained sum of squares} (ESS), {\it residual sum of squares} (RSS), and {\it total sum of squares} (TSS):
\[
ESS = \sum_{i=1}^n (\hat Y_i - \overline{Y})^2, \;
RSS = \sum_{i=1}^n (Y_i - \hat Y_i)^2 = \sum_{i=1}^n \hat \varepsilon_i^2, \;
TSS = \sum_{i=1}^n (Y_i - \overline{Y})^2 = ESS + RSS.
\]
$\bullet$ {\it Coefficient of determination} ({\it R-squared}) and {\it adjusted $R$-squared} (adjusted by degrees of freedom):
\[
R^2 = \frac{ESS}{TSS} = 1-\frac{RSS}{TSS}, \;
R^2_{adj} = 1 - \frac{\frac{RSS}{n-p-1}}{\frac{TSS}{n-1}},
\]
where $n$ is the number of observations, and $p$ is the number of independent variables in the model.
\smallskip
If you increase the number of explanatory variables in a regression model, your R-squared value increases or remains the same, but it can never cause your R-squared value to decrease. When you add more variables, you lose {\it degrees of freedom} (the number of observations above and beyond the number of estimated coefficients). {\bf Fewer degrees of freedom make your estimates less reliable} (for more on this topic, turn to Chapter 6). In order to compare two models on the basis of R-squared (adjusted or not), the dependent variable and sample size must be the same.
\smallskip
Here are a few reasons why you shouldn't use R-squared (adjusted or not) as the only measure of your regression's quality:
$\bullet$ A regression may have a high R-squared but have no meaningful interpretation because the model equation is not supported by economic theory or common sense.
$\bullet$ Using a small data set or one that includes inaccuracies can lead to a high R-squared value but deceptive results.
$\bullet$ Obsessing over R-squared may cause you to overlook important econometric problems.
\smallskip
In economic settings, a high R-squared (close to 1) is more likely to indicate that something is wrong with the regression instead of showing that it's of high quality. High R-squared values may be associated with regressions that violate assumptions and/or have nonsensical results (coefficients with the wrong sign, unbelievable magnitudes, and so on.). When evaluating regression quality, give these outcomes more weight than the R-squared.
\section{Assumptions of OLS Estimation and the Gauss-Markov Theorem}
{\bf The OLS/CLRM assumptions and their intuition.}
$\bullet$ {\it The model is linear in parameters and has an additive error term}. Other techniques, such as {\it maximum likelihood} (ML) estimation, can be used when the function you need to estimate is not linear in parameters.
\smallskip
$\bullet$ {\it The value for the independent variables are derived from a random sample of the population and contain variability}.
Strictly speaking, the CLRM assumes that the values of the independent variables are fixed in repeated random samples. The more common version of the assumption is that the values of the independent variable are random from sample to sample but independent of the error term. The weaker version is equivalent asymptotically (with large samples).
This assumption isn't likely to hold when you use lagged values of your dependent variable as an independent variable ({\it autoregression}) or when the value of your dependent variable simultaneously affects the value of one (or more) of your independent variables ({\it simultaneous equations}). Therefore, OLS is inappropriate in these situations.
In practice, for each random sample $X_i$ we often observe $Y$ only once. So we either assume a simple parametric model, e.g. linear regression, or use points in a neighborhood of $X_i$ for averaging, e.g. K-nearest neighbor regression (KNN regression). See James et al. \cite[page~104]{JWHT14} for details.
\smallskip
$\bullet$ {\it No independent variable is a perfect linear function of any other independent variable(s) (no perfect collinearity)}.
If you have perfect collinearity, the software program you use to calculate regression results cannot estimate the regression coefficients, since perfect collinearity causes you to lose linear independence and the computer can't identify the unique effect of each variable. In applied cases, high collinearity is much more common than perfect collinearity.
\smallskip
$\bullet$ {\it The model is correctly specified and the error term has a zero conditional mean}.
$E[\varepsilon|X=x]=0$ means for given $x$, the residuals $\varepsilon(x) = y-(\beta_0+\beta_1x)$ oscillate around 0 with average equal to 0. Graphically, this means the values of the dependent variable oscillate around the regression line with averages falling on the regression line. This assumption may fail if you have {\it misspecification} (you fail to include a relevant independent variable or you use an incorrect functional form) or a {\it restricted dependent variable} (namely, a qualitative or limited dependent variable).
\smallskip
$\bullet$ {\it The error term has a constant variance (no heteroskedasticity)}.
Graphically, this means the ``scatteredness'' of the values of the dependent variable around the regression line is approximately the same everywhere. Heteroskedasticity is a common problem for OLS regression estimation, especially with cross-sectional and panel data.
\smallskip
$\bullet$ {\it The values of the error term aren't correlated with each other (no autocorrelation or no serial correlation)}.
Graphically, no autocorrelation means the scatter plot of $(\varepsilon_{i-k},\varepsilon_i)_{i=k+1}^{\infty}$ spreads out homogeneously in all directions, for any $k\ge 1$. Autocorrelation can be quite common when you are estimating models with time-series data, because when observations are collected over time, they are unlikely to be independent from one another.
\medskip
{\bf The Gauss-Markov Theorem}. This theorem states that the ordinary least squares (OLS) estimators are the best linear unbiased estimators (BLUE) given the assumptions of the CLRM.
$\bullet$ {\it Linearity of OLS} (as a function of the observed $Y$ values):
\[
\hat \beta_1 = \sum_{i=1}^n c_i (Y_i - \overline{Y}), \; \hat \beta_0 = \overline{Y} - \left[\sum_{i=1}^n c_i(Y_i - \overline{Y})\right] \overline{X},
\]
where $c_i = \frac{X_i - \overline{X}}{\sum_{i=1}^n (X_i-\overline{X})^2}$, $i=1, \cdots, n$.
$\bullet$ {\it Unbiasedness}: $E[\hat\beta_1]=\beta_1$, $E[\hat\beta_0]=\beta_0$.
$\bullet$ {\it Best} means achieving the smallest possible variance among all similar estimators.
\[
\mbox{Var}(\hat\beta_1) = \frac{\sigma_{\varepsilon}^2}{\sum_{i=1}^n(X_i-\overline{X})^2}.
\]
\smallskip
When judging how good or bad an estimator is, econometricians usually evaluate the amount of bias and variance of that estimator. The BLUE property of OLS estimators is viewed as the gold standard.
Econometricians have devised methods to deal with failures of the CLRM assumptions, but they aren't always successful in proving that the alternative method produces a BLUE. In those cases, they usually settle for an {\it asymptotic} property known as {\it consistency}. Estimators are consistent if, as the sample size approaches infinity, the variance of the estimator gets smaller and the value of the estimator approaches the true population parameter value.
Also refer to Table 6-1: Summary of Gauss-Markov Assumptions \cite{Pedace13a}, page 19.
\section{The Normality Assumption and Inference with OLS}
{\bf The normality assumption}. The normality assumption in econometrics states that, for any given $X$ value, the error term follows a normal distribution with a zero mean and constant variance: $\varepsilon|X \sim N(0, \sigma^2_{\varepsilon})$.
The normality assumption isn't required for performing OLS estimation. It's necessary only when you want to produce confidence intervals and/or perform hypothesis tests with your OLS estimates.
In some applications, the assumption of a normal distribution for the error term may be difficult to justify. These situations typically involve a dependent variable $Y$ that has limited or highly skewed values. Econometricians have shown that with {\it large} sample sizes, normality is not a major issue because the OLS estimators are approximately normal even if the errors are not normal.
\medskip
{\bf The sampling distribution of OLS coefficients}. All OLS coefficients are a linear function of the error term. If you assume that the error term has a normal distribution, you're also assuming that the sampling distribution of the coefficients is normal:
\begin{eqnarray*}
\hat\beta_1
&=& \frac{\sum_{i=1}^n (X_i-\overline{X})(Y_i-\overline{Y})}{\sum_{i=1}^n (X_i-\overline{X})^2}
= \frac{\sum_{i=1}^n (X_i-\overline{X})(\beta_0 + \beta_1 X_i + \varepsilon_i - \overline{Y})}{\sum_{i=1}^n (X_i-\overline{X})^2}
= \beta_1 + \frac{\sum_{i=1}^n \varepsilon_i(X_i-\overline{X})}{\sum_{i=1}^n (X_i-\overline{X})^2} \\
&\sim& N\left(\beta_1, \sigma^2_{\hat\beta_1}\right),
\end{eqnarray*}
where $\sigma^2_{\hat\beta_1}=\frac{\sigma^2_{\varepsilon}}{\sum_{i=1}^n (X_i-\overline{X})^2}$ and
\[
\hat\beta_0 = \overline{Y} - \hat\beta_1\overline{X} = \beta_0 - \frac{\sum_{i=1}^n \varepsilon_i(X_i-\overline{X})}{\sum_{i=1}^n (X_i-\overline{X})^2}\overline{X}
\sim N\left(\beta_0, \sigma^2_{\hat\beta_0}\right).
\]
where $\sigma^2_{\hat\beta_0}=\frac{(\sum_{i=1}^n X_i^2) \sigma^2_{\varepsilon}}{n\sum_{i=1}^n (X_i-\overline{X})^2}$.
\medskip
{\bf OLS standard errors and the $t$-distribution}. In practice, the true variance of the error $\sigma^2_{\varepsilon}$ isn't known, but you can estimate it by calculating the {\it mean square error} (MSE):
\[
\hat\sigma^2_{\varepsilon} = \frac{RSS}{n-p-1} = \frac{\sum_{i=1}^n\hat\varepsilon_i^2}{n-p-1}.
\]
This provides the {\it standard errors of the coefficients}:
\[
\hat \sigma_{\hat \beta_1} = \frac{\hat \sigma_{\varepsilon}}{\sqrt{\sum_{i=1}^n(X_i-\overline{X})^2}}, \;
\hat \sigma_{\hat \beta_0} = \sqrt{\frac{\sum_{i=1}^n X_i^2}{n\sum_{i=1}^n(X_i-\overline{X})^2}} \cdot \hat\sigma_{\varepsilon}.
\]
The assumption that the error is normally distributed implies that the MSE and the estimated variances of the coefficients have a chi-squared ($\chi^2$) distribution with $n-p-1$ degrees of freedom. Therefore, for $k=0,1$,
\[
\frac{\hat\beta_k - \beta_k}{\hat\sigma_{\hat\beta_k}} \sim t_{n-p-1}.
\]
\medskip
{\bf Significance of individual regression coefficients}. You can report the statistical significance of your coefficients with either the {\it confidence interval approach} or the {\it test of significance approach}. The former provides you with a range of possible values for your estimator in repeated sampling, and the latter gives you a test statistic that's used to determine the likelihood of your hypothesis.
A {\it type I error} is rejecting a hypothesis that's true, and a {\it type II error} is failing to reject a hypothesis that's false. If you choose a higher level of significance, you increase the chances of committing a type I error. And if you choose a lower level of significance, you increase the chances of committing a type II error.
\medskip
{\bf Overall/joint significance}. The explained and unexplained variations from a regression model have a chi-squared distribution under the assumption that the conditional distribution of $Y$ is normal ($Y|X \sim N(\beta_0 + \beta_1X_{i1} + \beta_2X_{i2} + \cdots, \sigma_Y^2)$), which is equivalent to assuming that the error term is normally distributed ($\varepsilon|X \sim N(0,\sigma^2_{\varepsilon})$).
The R-squared value is a measure of overall fit for a regression model, but it doesn't tell you whether the amount of explained variation is statistically significant. Despite a low R-squared value, your model may explain a significant amount of variation in your dependent variable. The opposite may also be true; a high R-squared value may not be statistically significantly different from zero. In models with numerous independent variables, many of the variables can be individually statistically insignificant, yet they are collectively significant.
The null and alternative hypotheses to test for a regression model's overall significance are
\[
H_0: \beta_1 = \beta_2 = \cdots = \beta_p = 0,\;
H_1: \mbox{$H_0$ is not true.}
\]
Overall significance only examines the impact of the slope coefficients and is tested using the following $F$-statistic:
\[
F = \frac{\frac{ESS}{p}}{\frac{RSS}{n-p-1}} = \frac{\frac{R^2}{p}}{\frac{(1-R^2)}{(n-p-1)}} \sim F_{p, n-p-1}.
\]
For a given R-squared value, smaller $p$ yields bigger $F$, which has the interpretation of ``same R-squared value with fewer explanatory variables has more significance''.
The $F$-test can also be used to examine the joint significance of a subset of variables in a regression model that includes several independent variables:
\[
F = \frac{\frac{RSS_r-RSS_{ur}}{q}}{\frac{RSS_{ur}}{n-p-1}} = \frac{\frac{ESS_{ur} - ESS_r}{q}}{\frac{RSS_{ur}}{n-p-1}} \sim F_{q,n-p-1}
\]
where $RSS_r$ is the RSS for the {\it restricted} model (the model with fewer independent variables), $RSS_{ur}$ is the RSS for the {\it unrestricted} model (the model with more independent variables), $n$ is the number of sample measurements, $p$ is the number of independent variables in the unrestricted model, and $q$ is the number of independent variables contained in your unrestricted model that are not contained in your restricted model.
The $F$-test of overall significance is a special case of the more general test. In that case, $q=p$ because the restricted model contains no independent variables in a test of overall significance.
\medskip
The intuition of $F$-test is explained by Brooks \cite{Brooks08} as follows.
``To see why the test centres around a comparison of the residual sums of squares from the restricted and unrestricted
regressions, recall that OLS estimation involved choosing the model that minimised the residual sum of squares, with no constraints imposed. Now if, after imposing constraints on the model, a residual sum of squares results that is not much higher than the unconstrained model's residual sum of squares, it would be concluded that the restrictions were
supported by the data. On the other hand, if the residual sum of squares increased considerably after the restrictions were imposed, it would be concluded that the restrictions were not supported by the data and therefore that the hypothesis should be rejected.
It can be further stated that $RSS_r \ge RSS_{ur}$. Only under a particular set of very extreme circumstances will the residual sums of squares for the restricted and unrestricted models be exactly equal. This would be the case
when the restriction was already present in the data, so that it is not really a restriction at all."
\medskip
{\bf Forecasting}. Under the normality assumption, for a given value $X_0$ of the independent variable, the forecasted value of dependent variable $\hat Y_0 =\hat \beta_0 + \hat \beta_1 X_0$ is normally distributed:
\[
\hat Y_0 \sim N\left(\beta_0+\beta_1 X_0, \sigma^2_{\hat Y_0}\right)
\]
where
\[
\sigma^2_{\hat Y_0} = \sigma^2_{\varepsilon} \left[\frac{1}{n} + \frac{(X_0-\overline{X})^2}{\sum_{i=1}^n (X_i-\overline{X})^2}\right].
\]
In practice, we don't know the true variance of the error, so we use
\[
\hat\sigma^2_{\hat Y_0} = \hat\sigma^2_{\varepsilon} \left[\frac{1}{n} + \frac{(X_0-\overline{X})^2}{\sum_{i=1}^n (X_i-\overline{X})^2}\right],
\]
which has a chi-squared distribution with $n-p-1$ degrees of freedom.
Consequently, the confidence interval for the prediction is $\hat Y_0 \pm t_{\alpha/2,n-p-1} \hat\sigma_{\hat Y_0}$. A unique characteristic of this confidence interval is the changing standard error of the prediction; it is smallest at the mean value of $X$ and increases as $X_0$ deviates from the mean.
\part{Working with the Classical Regression Model}
\section{Functional Form, Specification, and Structural Stability}
{\bf Functional Form}.
\smallskip
$\bullet$ {\it Dimension/unit/scale}. Change in absolute amount or in percentage?
\hspace{.2in} \checkmark Log-log model (elasticity, i.e. the estimated percentage change in the dependent variable for a percentage change in the independent variable).
\hspace{.2in} \checkmark Log-linear model (the estimated percentage change in the dependent variable for a unit change in the independent variable).
\hspace{.2in} \checkmark Linear-log model (the estimated unit change in the dependent variable for a percentage change in the independent variable).
\smallskip
$\bullet$ {\it Graph of the dependent-independent variable chart}.
\hspace{.2in} \checkmark Quadratic function (best for finding minimums and maximums).
\hspace{.2in} \checkmark Cubic function (good for inflexion).
\hspace{.2in} \checkmark Inverse function (limiting the value of the dependent variable).
\hspace{.2in} \checkmark Linear-log model (the impact of the independent variable on the dependent variable decreases as the value of the independent variable increases).
\medskip
{\bf Misspecification}.
\smallskip
$\bullet$ {\it Omitting relevant variables}. You have an omitted variable bias if an excluded variable has some effect on your dependent variable and it's correlated with at least one of your independent variables. The intuition is best illustrated by projection in Hilbert space.
\smallskip
$\bullet$ {\it Including irrelevant variables}. The estimated coefficients remain unbiased but the standard errors are increased---the estimated standard error for any given regression coefficient is given by
\[
\hat \sigma_{\hat\beta_k} = \sqrt{\frac{\hat\sigma^2_{\varepsilon}}{\sum_{i=1}^n (X_{ik} - \overline{X}_k)^2(1-R_k^2)}}
\]
where $R_k^2$ is the R-squared from the regression of $X_k$ on the other independent variables. Including irrelevant variables does not change $\hat\sigma^2_{\varepsilon}$ while increasing $R_k^2$.
Just because your estimated coefficient isn't statistically significant doesn't make it irrelevant. A well-specified model usually includes some variables that are statistically significant and some that aren't. Additionally, variables that aren't statistically significant can contribute enough explained variation to have no detrimental impact on the standard errors.
\medskip
{\bf Structural Stability}.
\smallskip
$\bullet$ {\it Perform a RESET to test the severity of specification issues}. Ramsey's {\it regression specification error test} (RESET) is conducted by adding a quartic function of the fitted values of the dependent variable ($\hat Y_i^2$, $\hat Y_i^3$, and $\hat Y_i^4$) to the original regression and then testing the joint significance of the coefficients for the added variables. The logic of using a quartic of your fitted values is that they serve as proxies for variables that may have been omitted. Because the proxies are essentially nonlinear functions of your $X$s, RESET is also testing misspecification from functional form.
\hspace{.2in} 1. Estimate the model you want to test for specification error. E.g. $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \varepsilon_i$.
\hspace{.2in} 2. Obtain the fitted values after estimating your model and estimate $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \alpha \hat Y_i^2 + \gamma \hat Y_i^3 + \delta \hat Y_i^4 + \varepsilon_i$.
\hspace{.2in} 3. Test the joint significance of the coefficients on the fitted values of $Y_i$ terms ($\alpha$, $\gamma$, and $\delta$) using an $F$-statistic.
A RESET allows you to identify whether misspecification is a serious problem with your model, but it doesn't allow you to determine the source.
\smallskip
$\bullet$ {\it Use the Chow test to determine structural stability}. Sometimes specification issues arise because the parameters of the model either aren't stable or they change. We can conduct a Chow test for structural stability between any two groups ($A$ and $B$) in just three steps:
\hspace{.2in} 1. Estimate your model combining all data and obtain the residual sum of squares ($RSS_r$) with degrees of freedom $n-p-1$.
\hspace{.2in} 2. Estimate your model separately for each group and obtain the residual sum of squares for group $A$, $RSS_{ur, A}$, with degrees of freedom $n_A - p - 1$ and the residual sum of squares for group $B$, $RSS_{ur, B}$, with degrees of freedom $n_B - p -1$.
\hspace{.2in} 3. Compute the $F$-statistic by using this formula:
\[
F = \frac{\frac{RSS_r - (RSS_{ur,A} + RSS_{ur,B})}{p+1}}{\frac{RSS_{ur,A}+RSS_{ur,B}}{n-2p-2}}.
\]
The null hypothesis for the Chow test is structural stability. The larger the $F$-statistic, the more evidence you have against structural stability and the more likely the coefficients are to vary from group to group. Note the result of the $F$-statistic for the Chow test assumes homoskedasticity. A large $F$-statistic only informs you that the parameters vary between the groups, but it doesn't tell you which specific parameter(s) is (are) the source(s) of the structural break.
\smallskip
$\bullet$ {\it Robustness/sensitivity analysis}. If the coefficients of your core variables aren't sensitive (maintain the same sign with similar magnitudes and levels of significance), then they are considered {\it robust}. Some variables, despite not being of primary interest (that is, despite not being core), are likely to be essential control variables that would be included in any analysis of your outcome of interest (you should rely on economic theory and your common sense here).
\section{Regression with Dummy Explanatory Variables}
{\bf Interpretation}.
\smallskip
$\bullet$ The coefficient for your dummy variable(s) in a regression containing a quantitative variable shifts the regression function up or down. The same holds true when there's more than one dummy variable.
\smallskip
$\bullet$ The inclusion of an interaction term in your econometric model allows the regression function to have a different intercept and slope for each group identified by your dummy variables. The coefficient for your dummy variable(s) in a regression shifts the intercept, and the coefficient for your interaction term changes the slope (which is the impact of your quantitative variable).
\smallskip
$\bullet$ The inclusion of interacted dummy variables in your econometric model allows the regression function to have different intercepts for each combination of qualitative attributes. The coefficients for your dummy variables and their interaction shift the intercept by the estimated magnitude.
\medskip
{\bf Testing for significance}.
\smallskip
$\bullet$ Testing the joint significance of a group of dummy variables in a regression model is accomplished by generalizing the $F$-test of overall significance to
\[
F = \frac{\frac{RSS_r-RSS_{ur}}{q}}{\frac{RSS_{ur}}{n-p-1}} = \frac{\frac{ESS_{ur} - ESS_r}{q}}{\frac{RSS_{ur}}{n-p-1}} \sim F_{q,n-p-1}
\]
where $RSS_r$ is the residual sum of squares for the {\it restricted} model (the model excluding the dummy variables), $RSS_{ur}$ is the residual sum of squares for the {\it unrestricted} model (the model including the dummy variables), $n$ is the number of sample measurements, $p$ is the number of independent variables in the unrestricted model, and $q$ is the number of dummy variables added in your unrestricted model that are not contained in your restricted model.
\smallskip
$\bullet$ Using a dummy variable and interaction terms, a test of joint significance can be equivalent to performing a Chow test.
\hspace{.2in} 1. Create a dummy variable ($D$) that identifies any two groups suspected of a structural break.
\hspace{.2in} 2. Create interaction variables with your dummy variable and every other variable in your model.
\hspace{.2in} 3. Estimate the regression model that includes the quantitative, dummy, and interaction variables.
\hspace{.2in} 4. Test the joint significance of the dummy variable identifying the two groups and all the interaction terms that include this dummy variable.
\smallskip
The advantage of the dummy variable approach to testing for structural stability is that it allows you to identify the source of the difference between the groups. The disadvantage of the dummy variable approach is that it may not be practical if you're working with numerous independent variables.
\part{Violations of Classical Regression Model Assumptions}
\section{Multicollinearity}
{\bf Multicollinearity} refers to a linear relationship between two or more independent variables in a regression model. There are two types of multicollinearity:
{\it Perfect multicollinearity}. When perfectly collinear variables are included as independent variables, you can't use the OLS technique to estimate the value of the parameters. Your regression coefficients are indeterminate and their standard errors are infinite.
{\it High multicollinearity}. It's much more common than its perfect counterpart and can be equally problematic when it comes to estimating an econometric model. Technically, the presence of high multicollinearity doesn't violate any CLRM assumptions. Consequently, OLS estimates can be obtained and are BLUE with high multicollinearity. The larger variances (and standard errors) of the OLS estimators are the main reason to avoid high multicollinearity.
When econometricians point to a multicollinearity issue, they're typically referring to {\it high} multicollinearity rather than {\it perfect} multicollinearity. Most econometric software programs identify perfect multicollinearity and drop one (or more) variables prior to providing the estimation results.
\smallskip
$\bullet$ {\it Causes of multicollinearity include}
\hspace{.2in} \checkmark You use variables that are lagged values of one another.
\hspace{.2in} \checkmark You use variables that share a common time trend component.
\hspace{.2in} \checkmark You use variables that capture similar phenomena.
\smallskip
$\bullet$ {\it Consequences of high multicollinearity include}
\hspace{.2in} \checkmark Larger standard errors and insignificant $t$-statistics:
\[
\sigma^2_{\hat\beta_k} = \frac{\hat\sigma^2_{\varepsilon}}{\sum(X_{ik}-\overline{X}_k)^2(1-R^2_k)},
\]
where $\hat\sigma^2_{\varepsilon}$ is the mean squared error (MSE) and $R^2_k$ is the R-squared value from regressing $X_k$ on the other $X$s. Higher multicollinearity results in a larger $R^2_k$, which increases the standard error of the coefficient. Because the $t$-statistic associated with a coefficient is $t_k = \frac{\hat\beta_k}{\hat\sigma_{\hat\beta_k}}$, high multicollinearity also tends to result in insignificant $t$-statistics.
\hspace{.2in} \checkmark Coefficient estimates that are sensitive to changes in specification. If the independent variables are highly collinear, the estimates must emphasize small differences in the variables in order to assign an independent effect to each of them.
\hspace{.2in} \checkmark Nonsensical coefficient signs and magnitudes. With higher multicollinearity, the variance of the estimated coefficients increases, which in turn increases the chances of obtaining coefficient estimates with extreme values.
When two (or more) variables exhibit high multicollinearity, there's more uncertainty as to which variables should be credited with explaining variation in the dependent variable. For this reason, a high R-squared value combined with many statistically insignificant coefficients is a common consequence of high multicollinearity.
\medskip
{\bf Rule of thumb for identifying multicollinearity}. Because high multicollinearity doesn't violate a CLRM assumption and is a sample-specific issue, researchers typically don't use formal statistical tests to detect multicollinearity. Instead, they use two sample measurements as indicators of a potential multicollinearity problem.
\smallskip
$\bullet$ {\bf Pairwise correlation coefficients}. The sample correlation of two independent variables, $X_k$ and $X_j$, is calculated as
\[
r_{kj} = \frac{s_{kj}}{s_ks_j}.
\]
As a rule of thumb, correlation coefficients around 0.8 or above may signal a multicollinearity problem. Other evidence you should also check include insignificant $t$-statistics, sensitive coefficient estimates, and nonsensical coefficient signs and values.
Note that pairwise correlation coefficients only identify the linear relationship between two variables; they do not detect a linear relationship among more than two variables.
\smallskip
$\bullet$ {\bf Auxiliary regression and the variance inflation factor (VIF)}. A VIF for any given independent variable is calculated by
\[
VIF_k = \frac{1}{1-R_k^2}
\]
where $R_k^2$ is the R-squared value obtained by regressing independent variable $X_k$ on all the other independent variables in the model.
As a rule of thumb, VIFs greater than 10 signal a highly likely multicollinearity problem, and VIFs between 5 and 10 signal a somewhat likely multicollinearity issue. Remember to check also other evidence of multicollinearity (insignificant $t$-statistics, sensitive or nonsensical coefficient estimates, and nonsensical coefficient signs and values). A high VIF is only an indicator of potential multicollinearity, but it may not result in a large variance for the estimator if the variance of the independent variable is also large.
\medskip
{\bf Resolving multicollinearity issues}. If the primary purpose of your study is to estimate a model for prediction or forecasting, then the best solution may be to do nothing. If you want to obtain reliable estimates of the individual parameters in the model, you need to be more concerned with multicollinearity. (But you shouldn't modify your model if the $t$-statistics of the suspect variable(s) are greater than 2 {\it and} the coefficient signs and magnitudes make economic sense.)
We should take a holistic approach that considers the benefits of eliminating high correlation between the independent variables against the costs of addressing an issue that's specific to the sample you're using rather than the population of interest. Once we decide to resolve the multicollinearity issue, we have several options:
\smallskip
$\bullet$ {\bf Acquire more data}. High multicollinearity may be unique to your sample, so the acquisition of additional data is a potential solution. But don't automatically assume a ``more is better" mentality when building your database, since the collection of additional data may be costly or could inadvertently result in a change of your population.
\smallskip
$\bullet$ {\bf Use a new model}.
\hspace{.2in} \checkmark {\it First-differencing}. Its use is limited to models utilizing time-series or panel data. It also has its cost: 1) losing observations; 2) losing variation in your independent variables (resulting in insignificant coefficients); 3) changing the specification (possibly resulting in misspecification bias).
\hspace{.2in} \checkmark {\it The composite index variable}. But never combine variables into an index that would, individually, be expected to have opposite signs.
\smallskip
$\bullet$ {\bf Expel the problem variable(s)}. In case of severely high multicollinearity (correlation coefficients greater than 0.9), you don't have to follow any statistical rationale for choosing to drop one variable over another. If you're using VIFs to detect multicollinearity, a variable with a VIF greater than 10 is usually the most likely to be dropped. A smaller MSE usually signals that the statistical benefits of dropping the variable exceed the costs of specification bias. Save this method as a last resort and place theoretical considerations above purely statistical justifications.
\section{Heteroskedasticity}
{\bf Homoskedasticity} is expressed as $Var(\varepsilon|{\bf X}_i) = \sigma^2_{\varepsilon} = \mbox{a constant for all $i$}$ ($i=1,2,\cdots,N$), where ${\bf X}_i$ represents a vector of values for each individual and for all the independent variables. {\bf Heteroskedasticity} is expressed as $Var(\varepsilon|{\bf X}_i) = \sigma^2_{i\varepsilon}$ ($i=1,2,\cdots,N$).
\medskip
{\bf The consequences of heteroskedasticity}. In the presence of heteroskedasticity, the OLS estimators may not be efficient (achieve the smallest variance). In addition, the estimated standard errors of the coefficients will be biased, which results in unreliable hypothesis tests ($t$-statistics). The OLS estimates, however, remain unbiased.
Under the assumption of homoskedasticity, for model $Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i$,
\[
Var(\hat\beta_1) = \frac{\sigma^2_{\varepsilon}}{TSS_X}
\]
where $TSS_X=\sum (X_i - \overline{X})^2$. Without the homoskedasticity assumption, the variance of $\beta_1$ is
\[
Var(\hat\beta_1) = \frac{\sum (X_i-\overline{X})^2\sigma^2_{i\varepsilon}}{TSS_X^2}
\]
where $\sigma^2_{i\varepsilon}$ is the heteroskedastic variance of the error. The $t$-statistic for coefficients is calculated with
\[
t = \frac{\mbox{estimated $\beta$} - \mbox{hypothesized $\beta$}}{\mbox{std error}}.
\]
Therefore, any bias in the standard error estimate is passed on to your $t$-statistics and conclusions about statistical significance.
Heteroskedasticity is a common problem for OLS regression estimation, especially with cross-sectional and panel data. You usually have no way to know in advance if it's going to be present, and theory is rarely useful in anticipating its presence.
\medskip
{\bf Detecting heteroskedasticity with residual analysis}. The challenge to identifying heteroskedasticity is that you can only know $\sigma^2_{i\varepsilon}$ if you have the entire population corresponding to the chosen independent variables ($X$s). In practice, you'll be using a sample with only a limited number of observations for a particular $X$. Consequently, in applied situations the detection of heteroskedasticity relies on your intuition, prior empirical work, educated guesswork, or even sheer speculation.
\smallskip
$\bullet$ {\bf Examining the residuals in graph form}.
\smallskip
$\bullet$ {\bf The Breusch-Pagan test}. This test assumes that heteroskedasticity may be a linear function of all the independent variables in the model: $\varepsilon_i^2 = \alpha_0 + \alpha_1 X_{i1} + \cdots + \alpha_p X_{ip} + u_i$. The values for $\varepsilon_i^2$ aren't known in practice, so the $\hat{\varepsilon}_i^2$ are calculated from the residuals and used as proxies for $\varepsilon_i^2$. Generally, the BP test is based on the estimation of $\hat{\varepsilon}_i^2 = \alpha_0 + \alpha_1 X_{i1} + \cdots + \alpha_p X_{ip} + u_i$. Alternatively, a BP test can be performed by estimating $\hat{\varepsilon}_i^2 = \delta_0 + \delta_1 \hat{Y}_i$, where $\hat{Y}_i = \hat{\beta}_0 + \hat{\beta}_1 X_{i1} + \cdots + \hat{\beta}_p X_{ip}$. Here's how to perform a BP test:
\hspace{.2in} 1. Estimate your model, $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_p X_{ip} + \varepsilon_i$, using OLS.
\hspace{.2in} 2. Obtain the predicted $Y$ values $(\hat{Y}_i)$ after estimating the model.
\hspace{.2in} 3. Estimate the auxiliary regression, $\hat{\varepsilon}_i^2 = \delta_0 + \delta_1 \hat{Y}_i$, using OLS.
\hspace{.2in} 4. Retain the R-squared value $R^2_{\hat{\varepsilon}^2}$, from this auxiliary regression.
\hspace{.2in} 5. Calculate the $F$-statistic, $F=\frac{\frac{R^2_{\hat{\varepsilon}^2}}{1}}{\frac{(1-R^2_{\hat{\varepsilon}^2})}{n-2}}$, or the chi-squared statistic, $\chi^2 = nR^2_{\hat{\varepsilon}^2}$. If either of these test statistics is significant, then you have evidence of heteroskedasticity.
\smallskip
$\bullet$ {\bf The White test}. The White test assumes that heteroskedasticity may be a linear function of all the independent variables, a function of their squared values, and a function of their cross products:
\[
\hat{\varepsilon}_i^2 = \alpha_0 + \alpha_1 X_{i1} + \cdots + \alpha_p X_{ip} + \alpha_{p+1} X_{i1}^2 + \cdots + \alpha_{2p} X_{ip}^2 + \alpha_{2p+1}(X_{i1}X_{i2}) + \cdots + u_i,
\]
where $\hat{\varepsilon}_i^2$ are calculated from the residuals and used as proxies for $\varepsilon_i^2$. Alternatively, a White test can be performed by estimating $\hat{\varepsilon}_i^2 = \delta_0 + \delta_1 \hat{Y}_i + \delta_2 \hat{Y}_i^2$ where $\hat{Y}_i$ represents the predicted values from $\hat{Y}_i = \hat{\beta}_0 + \hat{\beta}_1 X_{i1} + \cdots + \hat{\beta}_p X_{ip}$. Here's how to perform a White test:
\hspace{.2in} 1. Estimate your model, $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_p X_{ip} + \varepsilon_i$, using OLS.
\hspace{.2in} 2. Obtain the predicted $Y$ values $(\hat{Y}_i)$ after estimating your model.
\hspace{.2in} 3. Estimate the model $\hat{\varepsilon}_i^2 = \delta_0 + \delta_1 \hat{Y}_i + \delta_2 \hat{Y}_i^2$ using OLS.
\hspace{.2in} 4. Retain the R-squared value $(R_{\hat{\varepsilon}^2}^2)$ from this regression.
\hspace{.2in} 5. Calculate the $F$-statistic, $F=\frac{\frac{R^2_{\hat{\varepsilon}^2}}{2}}{\frac{(1-R^2_{\hat{\varepsilon}^2})}{n-3}}$, or the chi-squared statistic, $\chi^2 = nR^2_{\hat{\varepsilon}^2}$. If either of these test statistics is significant, then you have evidence of heteroskedasticity.
\smallskip
$\bullet$ {\bf The Goldfeld-Quandt test}. The Goldfeld-Quandt (GQ) test begins by assuming that a defining point exists and can be used to differentiate the variance of the error term. Sample observations are divided into two groups, and evidence of heteroskedasticity is based on a comparison of the residual sum of squares ($RSS$) using the $F$-statistic.
\hspace{.2in} 1. Estimate your model separately for each group and obtain the residual sum of squares for Group A ($RSS_{A}$) and the residual sum of squares for Group B ($RSS_B$).
\hspace{.2in} 2. Compute the $F$-statistic by
\[
F = \frac{\frac{RSS_A}{n_A-p-1}}{\frac{RSS_B}{n_B-p-1}}.
\]
The null hypothesis for the GQ test is homoskedasticity. The larger the $F$-statistic, the more evidence you'll have against the homoskedasticity assumption.
\smallskip
$\bullet$ {\bf The Park test}. The Park test assumes that the heteroskedasticity may be proportional to some power of an independent variable $(X_k)$ in the model: $\sigma^2_{i\varepsilon} = \sigma^2_{\varepsilon} X^{\alpha}_{ik}$.
\hspace{.2in} 1. Estimate the model $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_p X_{ip} + \varepsilon_i$ using OLS.
\hspace{.2in} 2. Obtain the squared residuals, $\hat{\varepsilon}^2_i$, after estimating your model.
\hspace{.2in} 3. Estimate the model $\ln \hat{\varepsilon}_i^2 = \gamma + \alpha \ln X_{ik} + u_i$ using OLS.
\hspace{.2in} 4. Examine the statistical significance of $\alpha$ using the $t$-statistic: $t = \frac{\hat{\alpha}}{\hat{\sigma}_{\hat{\alpha}}}$. If the estimate of $\alpha$ coefficient is statistically significant, then you have evidence of heteroskedasticity.
\medskip
{\bf Correcting your regression model for the presence of heteroskedasticity}.
\smallskip
$\bullet$ {\bf Weighted least squares (WLS)}. The goal of the WLS transformation is to make the error term in the original econometric model homoskedastic. First, you assume that the heteroskedasticity is determined proportionally from some function of the independent variables: $Var(\varepsilon|{\bf X}_i) = \sigma^2_{\varepsilon} h({\bf X}_i)$. Then you use knowledge of this relationship to divide both sides of the original model by the component of heteroskedasticity that give the error term a constant variance. More specifically, the objective of OLS is
\[
\min \sum \left(Y_i - \hat{\beta}_0 - \hat{\beta}_1 X_{i1} - \cdots - \hat{\beta}_p X_{ip} \right)^2.
\]
The objective of WLS is
\[
\min \sum \frac{\left(Y_i - \hat{\beta}_0 - \hat{\beta}_1 X_{i1} - \cdots - \hat{\beta}_p X_{ip} \right)^2}{h({\bf X}_i)}.
\]
In practice, knowing the exact functional form of $h({\bf X}_i)$ is impossible. In applied settings, the exponential function is the most common approach to modeling heteroskedasticity: $Var(\varepsilon|{\bf X}_i) = \sigma^2_{\varepsilon} \exp(\alpha_0 + \alpha_1 X_{i1} + \cdots + \alpha_p X_{ip})$.
\hspace{.2in} 1. Estimate the original model, $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_p X_{ip} + \varepsilon_i$, and obtain the residuals, $\hat{\varepsilon}_i$.
\hspace{.2in} 2. Square the residuals and take their natural log to generate $\ln \hat{\varepsilon}_i^2$.
\hspace{.2in} 3. Estimate the regression $\ln \hat{\varepsilon}_i^2 = \gamma + \delta_1 X_{i1} + \cdots + \delta_p X_{ip} + v_i$ or $\ln \hat{\varepsilon}_i^2 = \gamma + \phi_1 \hat{Y}_i + \phi_2 \hat{Y}_i^2 + u_i$ and obtain the fitted values: $\hat{g}_i = \hat{\gamma} + \hat{\phi}_1 \hat{Y}_i + \hat{\phi}_2 \hat{Y}_i^2$.
\hspace{.2in} 4. Take the inverse natural log of the fitted residuals $\exp(\hat{g}_i)$ to obtain $\hat{h}_i$.
\hspace{.2in} 5. Estimate the regression $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_p X_{ip} + \varepsilon_i$ by WLS using $\hat{h}_i$ as weights.
\smallskip
If the proposed model of heteroskedasticity is misspecified, then WLS may not be more efficient than OLS. The problem is that misspecification of the heteroskedasticity is difficult to identify. A large difference between OLS and WLS coefficients is more likely to imply that the model suffers from functional form specification bias than that it suffers from heteroskedasticity.
\smallskip
$\bullet$ {\bf Robust standard errors (White-corrected standard errors, heteroskedasticity-corrected standard errors)}. In a model with one independent variable and homoskedasticity, the variance of the estimator can be reduced to $Var(\hat{\beta}_1) = \sigma^2_{\varepsilon} \sum c_i^2$; with heteroskedasticity, the variance of the estimator is $Var(\hat{\beta}_1) = \sum c_i^2 \sigma^2_{i\varepsilon}$. In applied settings, the squared residuals $(\hat{\varepsilon}^2_i)$ are used as estimates of $\sigma^2_{i\varepsilon}$.
In a model with one independent variable, the robust standard error is
\[
se(\hat{\beta}_1)_{HC} = \sqrt{\frac{\sum(X_i-\overline{X})^2\hat{\varepsilon}_i^2}{\left(\sum (X_i - \overline{X})^2\right)^2} }.
\]
Generalizing this result to a multiple regression model, the robust standard error is
\[
se(\hat{\beta}_k)_{HC} = \sqrt{ \frac{ \sum \hat{\omega}^2_{ik} \hat{\varepsilon}^2_i }{\left(\sum \hat{\omega}^2_{ik} \right)^2} }
\]
where the $\hat{\omega}^2_{ik}$ are the squared residuals obtained from the auxiliary regression of $X_k$ on all the other independent variables. Here's how to calculate robust standard errors:
\hspace{.2in} 1. Estimate your original multivariate model, $Y_i = \beta_0 + \beta_1 X_{i1} + \cdots + \beta_p X_{ip} + \varepsilon_i$, and obtain the squared residuals, $\hat{\varepsilon}^2_i$.
\hspace{.2in} 2. Estimate $p$ auxiliary regressions of each independent variable on all the other independent variables and retain all $p$ squared residuals $(\hat{\omega}^2_{ik})$.
\hspace{.2in} 3. For any independent variable, calculate the robust standard errors:
\[
se(\hat{\beta}_k)_{HC} = \sqrt{ \frac{ \sum \hat{\omega}^2_{ik} \hat{\varepsilon}^2_i }{\left(\sum \hat{\omega}^2_{ik} \right)^2} }.
\]
\smallskip
Numerous versions of robust standard errors exist for the purpose of improving the statistical properties of the heteroskedasticity correction; no form of robust standard error is preferred above all others.
\section{Autocorrelation}
{\bf Patterns of autocorrelation}. The CLRM assumes there's no autocorrelation: $Cov(\varepsilon_t, \varepsilon_s) = 0$ or $Corr(\varepsilon_t, \varepsilon_s) = 0$ for all $t\ne s$. When the error term exhibits no autocorrelation, the positive and negative error values are random.
If autocorrelation is present, positive autocorrelation is the most likely outcome. {\it Positive autocorrelation} occurs when an error of a given sign tends to be followed by an error of the same sign, which is called {\it sequencing}. {\it Negative autocorrelation} occurs when an error of a given sign tends to be followed by an error of the opposite sign, which is called {\it switching}.
When you're drawing conclusions about autocorrelation using the error pattern, all other CLRM assumptions must hold, especially the assumption that the model is correctly specified. If a model isn't correctly specified, you may mistakenly identify the model as suffering from autocorrelation. Misspecification is a more serious issue than autocorrelation.
\medskip
{\bf Effect of autoregressive errors}. In the presence of autocorrelation, the OLS estimators may not be efficient. In addition, the estimated standard errors of the coefficients are biased, which results in unreliable hypothesis tests ($t$-statistics). The OLS estimates, however, remain unbiased.
Typically, autocorrelation is assumed to be represented by a {\it first-order autoregression}:
\[
Y_t = \beta_0 + \sum_{i=1}^p \beta_i X_{ti} + \varepsilon_t
\]
with
\[
\varepsilon_t = \rho \varepsilon_{t-1} + u_t,
\]
where $-1 < \rho < 1$ and $u_t$ is a random error that satisfies the CLRM assumptions; namely $E[u_t|\varepsilon_{t-1}]=0$, $Var(u_t|\varepsilon_{t-1})=\sigma_u^2$, and $Cov(u_t, u_s) = 0$ for all $t\ne s$.
By repeated substitution, we obtain
\[
\varepsilon_t = u_t + \rho u_{t-1} + \rho^2 u_{t-2} + \rho^3 u_{t-3} + \cdots.
\]
Therefore
\[
E[\varepsilon_t] = 0, \; Var(\varepsilon_t) = \sigma_u^2 + \rho^2 \sigma_u^2 + \rho^4 \sigma_u^2 + \cdots = \frac{\sigma_u^2}{1-\rho^2}.
\]
The stationarity assumption ($|\rho|<1$) is necessary to constrain the variance from becoming an infinite value. OLS assumes no autocorrelation; that is, $\rho = 0$ in the expression $\sigma^2_{\varepsilon} = \frac{\sigma_u^2}{1-\rho^2}$. Consequently, in the presence of autocorrelation, the estimated variances and standard errors from OLS are underestimated.
\medskip
{\bf Test for autocorrelation}.
\smallskip
$\bullet$ {\bf Graphical inspection of residuals}. Look for {\it sequencing} or {\it switching} of residual errors if autocorrelation is present.
\smallskip
$\bullet$ {\bf The run test (the Geary test)}. You want to use the run test if you're uncertain about the nature of the autoregressive process (no assumptions about the $\rho$ values).
A {\it run} is defined as a sequence of positive or negative residuals. The hypothesis of no autocorrelation isn't sustainable if the residuals have too many or too few runs.
The most common version of the test assumes that runs are distributed normally. If the assumption of no autocorrelation is sustainable, with 95\% confidence, the number of runs should be between
\[
\mu_r \pm 1.96 \sigma_r
\]
where $\mu_r$ is the expected number of runs and $\sigma_r$ is the standard deviation. These values are calculated by
\[
\mu_r = \frac{2T_1 T_2}{T_1 + T_2} + 1, \; \sigma_r = \sqrt{ \frac{2T_1T_2(2T_1T_2-T_1-T_2)}{(T_1+T_2)^2(T_1+T_2-1)} }
\]
where $r$ is the number of observed runs, $T_1$ is the number of positive residuals, $T_2$ is the number of negative residuals, and $T$ is the total number of observations.
If the number of observed runs is below the expected interval, it's evidence of positive autocorrelation; if the number of runs exceeds the upper bound of the expected interval, it provides evidence of negative autocorrelation.
\smallskip
$\bullet$ {\bf The Durbin-Watson test for $AR(1)$ processes}. The Durbin-Watson (DW) test begins by assuming that if autocorrelation is present, then it can be described by an $AR(1)$ process:
\[
Y_t = \beta_0 + \sum_{i=1}^p \beta_i X_{ti} + \varepsilon_t, \; \varepsilon_t = \rho \varepsilon_{t-1} + u_t.
\]
The value produced by the DW test is called {\it d statistic} and is calculated as follows:
\[
d = \frac{\sum_{t=2}^T (\hat{\varepsilon}_t - \hat{\varepsilon}_{t-1})^2}{\sum_{t=1}^T \hat{\varepsilon}_t^2}
= \frac{\sum_{t=2}^T \hat{\varepsilon}_t^2}{\sum_{t=1}^T \hat{\varepsilon}_t^2} + \frac{\sum_{t=2}^T \hat{\varepsilon}_{t-1}^2}{\sum_{t=1}^T \hat{\varepsilon}_t^2} - \frac{2\sum_{t=2}^T\hat{\varepsilon}_t \hat{\varepsilon}_{t-1}}{\sum_{t=1}^T \hat{\varepsilon}_t^2}
\approx 1 + 1 - \frac{2\frac{\hat{\rho}\hat{\sigma}_u^2}{1-\hat{\rho}^2}}{\frac{\hat{\sigma}_u^2}{1-\hat{\rho}^2}} \approx 2(1-\hat{\rho}).
\]
where $T$ represents the last observation in the time series.
From the approximate formula $d \approx 2(1-\hat{\rho})$, the closer $d$ is to 2, the stronger the evidence of no autocorrelation; the closer $d$ is to 0, the more likely positive autocorrelation. If $d$ is closer to 4, then no autocorrelation is rejected in favor of negative autocorrelation.
The DW test has no unique critical value defining the point at which you reject the null hypothesis of no autocorrelation. However, it does have a zone of indecision defined by a lower bound ($d_L$) and upper bound ($d_U$) that depend on the number of observations and the number of estimated coefficients $(p+1)$ in the original model:
\[
\begin{cases}
\mbox{Reject $H_0$: $\rho>0$} & 0 < d < d_L \\
\mbox{indecision} & d_L \le d \le d_U \\
\mbox{Fail to reject $H_0$: No autocorrelation} & d_U < d < 4 - d_U \\
\mbox{indecision} & 4-d_U \le d \le 4-d_L \\
\mbox{Reject $H_0$: $\rho<0$} & 4-d_L < d < 4.
\end{cases}
\]
The DW $d$-statistic is the most popular test for autocorrelation, but it's limited to identifying $AR(1)$ autocorrelation. It's a good initial test, but additional testing may be required to rule out other forms of autocorrelation. Furthermore, a $d$-statistic that ends up in the indecision zone requires an alternative test to achieve a more conclusive result.
\smallskip
$\bullet$ {\bf The Breusch-Godfrey test for $AR(q)$ processes}. The Breusch-Godfrey (BG) test begins by assuming that if autocorrelation is
present, then it can be described by an $AR(q)$ process:
\[
Y_t = \beta_0 + \sum_{i=1}^p \beta_i X_{ti} + \varepsilon_t, \;
\varepsilon_t = \sum_{j=1}^q \rho_j\varepsilon_{t-j} + u_t
\]
where $1 \le q < T$. A special case of this test with $q=1$ is known as {\it Durbin's alternative statistic}.
You can perform a BG test by following these steps:
\hspace{.2in} 1. Estimate the model $Y_t = \beta_0 + \sum_{i=1}^p \beta_i X_{ti} + \varepsilon_t$ using OLS.
\hspace{.2in} 2. Obtain the residual values, $\hat{\varepsilon}_t$, after estimating your model.
\hspace{.2in} 3. Estimate the auxiliary regression $\hat{\varepsilon}_t = \alpha_0 + \sum_{i=1}^p \alpha_i X_{ti} + \sum_{j=1}^q \rho_j\hat{\varepsilon}_{t-j} + u_t$ using OLS.
\hspace{.2in} 4. Retain the R-squared value, $R^2_{\hat{\varepsilon}}$, from this regression.
\hspace{.2in} 5. Calculate the $F$-statistic for joint significance of $\hat{\rho}_1$, $\hat{\rho}_2$, $\cdots$, and $\hat{\rho}_q$ or the chi-squared statistic $\chi^2 = (T-q) R^2_{\hat{\varepsilon}}$ with $q$ degrees of freedom.
If the $F$ or chi-squared test statistics are significant, then you have evidence of autocorrelation. If not, you fail to reject the null hypothesis of no autocorrelation, which is $H_0: \rho_1 = \rho_2 = \cdots = \rho_q = 0$.
\medskip
{\bf Remedying harmful autocorrelation}.
\smallskip
$\bullet$ {\bf Feasible generalized least squares (FGLS)}. There are two FGLS techniques: the Cochrane-Orcutt (CO) transformation and the Prais-Winsten (PW) transformation. They transform the original model with autocorrelation into one without autocorrelation by {\it quasi-differencing}. If the proposed $AR(1)$ model of autocorrelation, $\varepsilon_t = \rho \varepsilon_{t-1} + u_t$, isn't correct, then you have no guarantee of getting more accurate standard errors with FGLS than OLS.
Here's how to apply either the CO or PW technique:
\hspace{.2in} 1. Estimate your original model, $Y_t = \beta_0 + \sum_{i=1}^p \beta_i X_{ti} + \varepsilon_t$, and obtain the residuals $\hat{\varepsilon}_t$.
\hspace{.2in} 2. Use the residuals to estimate $\rho$ by performing one of the following calculations:
\hspace{.6in} $\maltese$ $\hat{\rho} = \frac{\sum_{t=2}^T \hat{\varepsilon}_t \hat{\varepsilon}_{t-1} }{\sum_{t=1}^T \hat{\varepsilon}^2_t}$. This calculation can be used in large samples but may have significant error in smaller samples.
\hspace{.6in} $\maltese$ $\hat{\rho} = 1 - \frac{d}{2}$. This calculation, known as {\it Theil's estimator}, can be used with smaller samples.
\hspace{.6in} $\maltese$ Estimate $\hat{\varepsilon}_t = \rho \hat{\varepsilon}_{t-1} + u_t$ and obtain $\hat{\rho}$ from the regression. This method is the most common for estimating $\rho$ but is recommended only with larger samples.
\hspace{.2in} In practice, knowing the exact value of $\rho$ is impossible. In applied settings, you use the estimated value for $\rho$ (that is, $\hat{\rho}$) to transform the model.
\hspace{.2in} 3. Estimate the quasi-differenced CO or PW regression using $\hat{\rho}$ in place of $\rho$:
\[
Y_t - \rho Y_{t-1} = \beta_0(1-\rho) + \sum_{i=1}^p \beta_i (X_{ti}-\rho X_{(t-1)i}) + u_t
\]
The CO transformation sets $Y_t^*=Y_t - \rho Y_{t-1}$ and $\varepsilon_t^* = u_t$ so that the regression equation becomes
\[
Y_t^* = \beta_0^* + \sum_{i=1}^p \beta_i^* X^*_{ti} + \varepsilon_t^*.
\]
The PW transformation maintains the CO structure with the exception of the first observation:
\[
Y_1^* = (\sqrt{1-\rho^2}) Y_1, \; X_1^* = (\sqrt{1-\rho^2}) X_1, \; \varepsilon_1^* = (\sqrt{1-\rho^2}) \varepsilon_1.
\]
In large samples, the difference between the CO and PW estimates is usually small. In small samples, however, the difference can be significant.
\smallskip
$\bullet$ {\bf Serial correlation robust standard errors}. Estimating the model using OLS and adjusting the standard errors for autocorrelation has become more popular than other correction methods. There are two reasons for this: (1) The serial correlation robust standard errors can adjust the results in the presence of a basic $AR(1)$ process or a more complex $AR(q)$ process, and (2) only the biased portion of the results (the standard errors) are adjusted, while the unbiased estimates (the coefficients) are untouched, so no model transformation is required.
Adjusting the OLS standard errors for autocorrelation produces {\it serial correlation robust standard errors}. These are also referred to as {\it Newey-West (NW) standard errors} and can be calculated by applying the following steps:
\hspace{.2in} 1. Estimate your original model $Y_t = \beta_0 + \sum_{i=1}^p \beta_i X_{ti} + \varepsilon_t$ and obtain the residuals: $\hat{\varepsilon}_t$.
\hspace{.2in} 2. Estimate the auxiliary regression $X_{t1} = \alpha_0 + \sum_{i=2}^p \alpha_i X_{ti} + r_t$ and retain the residuals: $\hat{r}_t$.
\hspace{.2in} 3. Find the intermediate adjustment factor, $\hat{\alpha}_t = \hat{r}_t \hat{\varepsilon}_t$, and decide how much serial correlation (the number of lags) you're going to allow. A Breusch-Godfrey test can be useful in making this determination.
\hspace{.2in} 4. Obtain the error variance adjustment factor, $\hat{v} = \sum_{t=1}^T \hat{\alpha}_t^2 + 2 \sum_{h=1}^g \left[1 - \frac{h}{g+1}\right]\left(\sum_{t=h+1}^T \hat{\alpha}_t \hat{\alpha}_{t-h}\right)$, where $g$ represents the number of lags determined in Step 3.
\hspace{.2in} 5. Calculate the serial correlation robust standard error, which is also known as the {\it heteroskedasticity-autocorrelation-corrected} (HAC) standard error because the calculation simultaneously adjusts the standard error for heteroskedasticity and autocorrelation. For variable $X_1$,
\[
se(\hat{\beta}_1)_{HAC} = \left(\frac{se(\hat{\beta}_1)}{\hat{\sigma}_{\varepsilon}}\right)^2 \sqrt{\hat{v}}.
\]
\hspace{.2in} 6. Repeat Steps 2 through 5 for independent variables $X_2$ through $X_p$.
\part{Discrete and Restricted Dependent Variables in Econometrics}
\section{Qualitative Dependent Variables}
{\bf The Linear Probability Model (LPM)}. A basic LPM can be expressed as $Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i$, where $Y$ is a dummy variable that is equal to 1 if a particular outcome is observed and 0 otherwise. As usual, the predicted value $\hat{Y} = \hat{\beta}_0 + \hat{\beta}_1 X$ is interpreted as an estimate of $E[Y|X]$.
\smallskip
Although OLS estimation always produces the typical R-squared measure of fit, its interpretation is less meaningful when all the values of the dependent variable are at 0 or 1. In the case of an LPM, more appropriate measures of fit capture the fraction of times the model predicts accurately:
$\bullet$ Accurate prediction defined as (a) $\hat{P}_i \ge 0.5$ and $Y=1$ or (b) $\hat{P}_i < 0.5$ and $Y=0$. Accurate predictions aggregated by calculating the total number of accurate predictions as a percentage of the total number of observations.
$\bullet$ Accurate prediction defined as (a) $\hat{P}_i \ge \overline{Y}$ and $Y=1$ or (b) $\hat{P}_i < \overline{Y}$ and $Y=0$. Accurate predictions aggregated by calculating the total number of accurate predictions as a percentage of the total number of observations.
$\bullet$ Accurate prediction defined as $\hat{P}_i \ge 0.5$ and $Y=1$ or $\hat{P}_i < 0.5$ and $Y=0$. Accurate predictions aggregated by calculating the percent of accurate predictions in each group (for $Y=0$ and $Y=1$) and weighting the percent of observations in each group.
$\bullet$ Accurate prediction defined as $\hat{P}_i \ge \overline{Y}$ and $Y=1$ or $\hat{P}_i < \overline{Y}$ and $Y=0$. Accurate predictions aggregated by calculating the percent of accurate predictions in each group (for $Y=0$ and $Y=1$) and weighting the percent of observations in each group.
\medskip
{\bf The three main LPM problems}.
$\bullet$ {\bf Non-normality of the error term}. The assumption that the error is normally distributed is critical for performing hypothesis tests. The error term of an LPM has a binomial distribution instead of a normal distribution. It implies that the traditional $t$-tests for individual significance and $F$-test for overall significance are invalid.
$\bullet$ {\bf Heteroskedasticity}. The assumption of homoskedasticity is required to prove that the OLS estimators are efficient. The presence of heteroskedasticity can cause the Gauss-Markov theorem to be violated and lead to other undesirable characteristics for the OLS estimators. The error term in an LPM is heteroskedastic because its variance isn't constant:
\[
Var(\varepsilon_i) = (\beta_0 + \beta_1 X_i) (1 - \beta_0 - \beta_1 X_i).
\]
$\bullet$ {\bf Unbounded predicted probabilities}.
\medskip
{\bf The probit and logit models}. In a probit or logit model, we estimate
\[
E[Y|X_i] = P(Y=1|X_i) = F(\beta_0 + \beta_1 X_i),
\]
where $F$ is a monotone increasing function with range $(0, 1)$. For a probit model, $F$ is the CDF of a standard normal: $F(x) = \frac{1}{\sqrt{2\pi}}\int_{-\infty}^x e^{-\xi^2/2}d\xi$; for a logit model, $F(x) = \frac{e^x}{1+e^x}$.
Probit and logit functions are both nonlinear in parameters, so OLS can't be used to estimate the $\beta$s. Instead, we use maximum likelihood estimation: we solve for
\[
\arg\max_{\beta_0, \beta_1} \left(\mbox{probability of observing $Y_1$, $\cdots$, $Y_n$}\right) = \arg\max_{\beta_0, \beta_1} \prod_{i=1}^n F(\beta_0+\beta_1 X_i)^{Y_i} [1-F(\beta_0+\beta_1 X_i)]^{1-Y_i}.
\]
Finding the optimal values for the $\hat{\beta}$ terms requires solving the following first-order conditions
\[
\begin{cases}
\frac{\partial \ln \hat{L}}{\partial \hat{\beta}_0} = \sum_{i=1}^n \left[\frac{Y_i F'(\hat{\beta}_0 + \hat{\beta}_1 X_i)}{F(\hat{\beta}_0 + \hat{\beta}_1 X_i)} - \frac{(1-Y_i)F'(\hat{\beta}_0 + \hat{\beta}_1 X_i)}{1-F(\hat{\beta}_0 + \hat{\beta}_1 X_i)} \right] = 0\\
\frac{\partial \ln \hat{L}}{\partial \hat{\beta}_1} = \sum_{i=1}^n \left[\frac{Y_i F'(\hat{\beta}_0 + \hat{\beta}_1 X_i)}{F(\hat{\beta}_0 + \hat{\beta}_1 X_i)} - \frac{(1-Y_i)F'(\hat{\beta}_0 + \hat{\beta}_1 X_i)}{1-F(\hat{\beta}_0 + \hat{\beta}_1 X_i)} \right] X_i = 0
\end{cases}
\]
Probit and logit estimation always produces a {\it Pseudo R-squared} measure of fit: $\widetilde{R}^2 = 1 - \frac{\ln \hat{L}_{ur}}{\ln \hat{L}_0}$, where $\ln \hat{L}_{ur}$ is the log likelihood for the estimated model and $\ln \hat{L}_0$ is the log likelihood in the model with only an intercept.
You can obtain more appropriate measures of fit for probit and logit models by comparing the model's predicted probabilities to the observed $Y$ values. Appropriate measures of fit typically capture the fraction of times the model accurately predicts the outcome, e.g. the four measures of fit used for the LPM.
\section{Limited Dependent Variable Models}
{\bf Limited dependent variables}.
\smallskip
$\bullet$ {\bf Censored dependent variables}. With a {\it censored dependent variable}, some of the actual values for the dependent variable are limited to a minimum and/or maximum threshold value. This leads to nonzero conditional mean of the error and correlation between the value of the error and the value of the independent variable.
\smallskip
$\bullet$ {\bf Truncated dependent variables}. With a {\it truncated dependent variable}, some of the values for the variables are missing (meaning they aren't observed if they are above or below some threshold). Sometimes observations included in the sample have missing values for both the independent and dependent variables, and in other cases only the values for the dependent variable are missing. Common scenarios resulting in truncation include {\it nonrandom sample selection} and {\it self-selection}.
Truncated data leads to nonzero conditional mean of the error and correlation between the value of the error and the value of the independent variable.
\smallskip
The primary difference between a truncated and a censored variable is that the value of a truncated variable isn't observed at all. However, a value is observed for a censored variable, but it's suppressed for some observations at the threshold point.
\medskip
{\bf Regression analysis for limited dependent variables}.
\smallskip
$\bullet$ {\bf Tobin's Tobit for censored dependent variables}. If you use OLS estimation with the observed data as if they're all uncensored values, you get biased coefficients. To avoid them, the estimation procedure must properly account for the censoring of the dependent variable. Maximum likelihood (ML) estimation does so.
Suppose you have the following model with upper-limit censoring (the most common type):
\[
Y_i^* = \beta_0 + \beta_1 X_i + \varepsilon_i, \; \varepsilon \sim N(0, \sigma^2_{\varepsilon}), \;
Y_i =
\begin{cases}
Y_i^* & Y_i^* < b \\
b & Y_i^* \ge b.
\end{cases}
\]
Using the probability of censorship, estimation is accomplished with ML, where the log likelihood function to be maximized is
\[
\ln L = \sum_{i=1}^n \left\{\ln F\left(\frac{\beta_0 + \beta_1 X_i - b}{\sigma_{\varepsilon}}\right) + \ln \left[\frac{1}{\sigma_{\varepsilon}} F'\left(\frac{Y_i - \beta_0 - \beta_1 X_i}{\sigma_{\varepsilon}}\right)\right]\right\}
\]
where $F$ denotes the standard normal CDF.
Tobit estimation produces a likelihood ratio chi-squared statistic. It's analogous to the $F$-statistic in OLS, and it tests the null hypothesis that the estimated model doesn't produce a higher likelihood than a model with only a constant term.
\smallskip
$\bullet$ {\bf Truncated regression for truncated dependent variables with unobserved independent variables}. In this case, you can't apply OLS estimation to the observed data as if it's representative of the entire population. If you do, you'll wind up with biased coefficients. Instead, you need to use maximum likelihood (ML) estimation so you can properly account for the truncation by rescaling the normal distribution so that the cumulative probabilities add up to one over the restricted area.
Consider the following model
\[
Y_i^* = \beta_0 + \beta_1 X_i + \varepsilon_i, \; \varepsilon \sim N(0, \sigma^2_{\varepsilon}), \;
Y_i =
\begin{cases}
Y_i^* & Y_i^* < b \\
\cdot & Y_i^* \ge b.
\end{cases}
\]
The dot ($\cdot$) represents a missing value at and above the truncation point. Using a rescaling of the normal distribution, estimation is accomplished with ML, where the log likelihood function to be maximized is
\[
\ln L = -\frac{n}{2} \ln (2\pi \sigma_{\varepsilon}^2) - \frac{1}{2\sigma^2_{\varepsilon}} \sum_{i=1}^n (Y_i - \beta_0 - \beta_1 X_i)^2 - \sum_{i=1}^n \ln F\left(\frac{b - \beta_0 - \beta_1 X_i}{\sigma_{\varepsilon}}\right)
\]
where $F$ denotes the standard normal CDF.
Truncated normal estimation also produces a chi-squared statistic, which is like the $F$-statistic in OLS. It confirms or rejects the null hypothesis that the estimated model doesn't produce a higher likelihood than a model with only a constant term.
Ignoring the truncation and estimating the model using OLS will produce coefficients biased toward finding no relationship (smaller coefficients/effects).
\smallskip
$\bullet$ {\bf Heckman's selection bias correction for truncated dependent variables with observed independent variables}. Assume we work with the following model:
\[
Y_i^* = \beta_0 + \beta_1 X_i + \varepsilon_i, \; \varepsilon \sim N(0, \sigma^2_{\varepsilon})
\]
with self-selection defined by
\[
S_i = \gamma_0 + \gamma_1 W_{i1} + \gamma_2 W_{i2} + \cdots + u_i, \;
S_i = \begin{cases}
1 & \mbox{if $Y_i^*$ observed}\\
0 & \mbox{if $Y_i^*$ not observed},
\end{cases}
\; u \sim N(0,1), \; Corr(\varepsilon, u) = \rho.
\]
The log likelihood function that's maximized is
\begin{eqnarray*}
\ln L &=& \sum_{i=1}^n \left\{
\ln F \left[\frac{((\gamma_0+\gamma_1W_{i1}+\gamma_2W_{i2}+\cdots)+(Y_i^*-\beta_0-\beta_1X_i)\rho)/\sigma_{\varepsilon}}{\sqrt{1-\rho^2}}\right] \right.\\
& & \left. - \frac{1}{2}\left(\frac{Y_i^* - \beta_0 - \beta_1X_i}{\sigma_{\varepsilon}}\right)^2
- \ln(\sqrt{2\pi}\sigma_{\varepsilon}) + \ln F(-\gamma_0 -\gamma_1W_{i1} - \gamma_2W_{i2} - \cdots)
\right\}
\end{eqnarray*}
where $F$ denotes the standard normal CDF. In a Heckman model, the variables that influence truncation usually aren't identical to those that influence the value of the dependent variable (in contrast to the Tobit model, where they're assumed to be the same).
Sometimes the ML estimation fails to converge, and an alternative is to use the Heckit model. It can be accomplished by following these steps:
\hspace{.2in} 1. Estimate the selection equation $S_i = \gamma_0 + \gamma_1 W_{i1} + \gamma_2 W_{i2} + \cdots + u$ with a probit model.
\hspace{.2in} 2. Compute the inverse Mills ratio:
\[
\hat{\lambda}_i = \frac{F'(\hat{\gamma}_0 + \hat{\gamma}_1 W_{i1} + \hat{\gamma}_2 W_{i2} + \cdots)}{F(\hat{\gamma}_0 + \hat{\gamma}_1 W_{i1} + \hat{\gamma}_2 W_{i2} + \cdots)}
\]
where $F$ is the standard normal CDF.
\hspace{.2in} 3. Estimate the model $Y_i = \beta_0 + \beta_1 X_i + \beta_2 \hat{\lambda}_i + \varepsilon_i$ using the selected sample.
Estimation of a Heckman selection model also produces a chi-squared statistic, which is similar to the $F$-statistic in OLS and tests the null hypothesis that the estimated model doesn't produce a higher likelihood than a model with only a constant term.
\part{Extending the Basic Econometric Model}
\section{Static and Dynamic Models}
{\bf Using contemporaneous and lagged variables in regression analysis}.
\smallskip
$\bullet$ {\bf Problems with dynamic models}. When you're using time-series data, you can assume that the independent variables have a contemporaneous (static) or lagged (dynamic) effect on our dependent variable. A generic dynamic model is a {\it distributed lag model}. You can specify it as
\[
Y_t = \alpha + \delta_0 X_t + \delta_1 X_{t-1} + \delta_2 X_{t-2} + \cdots + \delta_r X_{t-r} + \varepsilon_t.
\]
In practice, distributed lag models can be plagued by estimation problems. The two most common issues are high multicollinearity and the loss of degrees of freedom: high multicollinearity usually causes the coefficient estimates to display erratic behavior, while loss of degrees of freedom increases the standard errors and reduces the chances of finding statistically significant coefficients.
A common solution to the estimation issues is to replace the lagged values of the independent variable with a lagged value of the dependent variable---an autoregressive model like $Y_t = \alpha + \delta X_t + \gamma Y_{t-1} + \varepsilon_t$. Using recursive substitution, we can show that the autoregressive model is equivalent to the distributed lag model.
The distributed lag estimates suffer from unpredictable shifts in the parameter estimates because they're plagued by high collinearity. Therefore, when estimating dynamic models, applied econometricians prefer the autoregressive model to the distributed lag model.
\smallskip
$\bullet$ {\bf Testing and correcting for autocorrelation in dynamic models}. Autocorrelation in a dynamic model causes the OLS coefficients to be biased. Because econometricians view biased coefficients to be more problematic than biased standard errors, testing for autocorrelation is essential if you're estimating a dynamic model. Turn to the Breusch-Godfrey test in this scenario. Avoid using the Durbin-Watson $d$ statistic when you're estimating a dynamic time-series model since in a dynamic model, the Durbin-Watson $d$ statistic is biased toward 2 (that is, finding no autocorrelation).
If you find evidence of autocorrelation, you can perform the preferred method of autocorrelation correction with dynamic models: feasible generalized least squares (FGLS).
\medskip
{\bf Projecting time trends with OLS}. If the dependent variable has a relatively steady increase over time, your best bet is to model the relationship with a linear time trend $Y_t = \alpha_0 + \alpha_1 t + \varepsilon_t$; if the growth rate is fairly steady, then you need to model the relationship with an exponential time trend $\ln Y_t = \alpha_0 + \alpha_1 t + \varepsilon_t$.
\smallskip
$\bullet$ {\bf Spurious correlation and time series}. If your regression model contains dependent and independent variables that are trending, then you end up with a {\it spurious correlation problem}. This is because if time significantly explains variation in the dependent variable and is also correlated with your independent variable, then you've excluded a relevant variable from your model and you'll overstate the explanatory power of your independent variables.
Adding some form of time trend component to your regression takes care of the spurious correlation problem.
\smallskip
$\bullet$ {\bf Detrending time-series data}. The main point of estimating a regression model with detrended data is to derive the explanatory power of the other independent variables. Here's how to obtain the goodness-of-fit, or R-squared, net of trend effects:
\hspace{.2in} 1. Regress your dependent variable on the trend variable to obtain the estimated function $Y_t = \hat{\alpha}_0 + \hat{\alpha}_1 t + \hat{\varepsilon}_{tY}$ and retain the residuals from this regression.
\hspace{.2in} 2. Regress each of your independent variables on the trend variable to obtain the estimated functions $X_{tk} = \hat{\alpha}_{0k} + \hat{\alpha}_{1k}t + \hat{\varepsilon}_{tX_k}$, where $k$ represents a specific independent variable, and retain the residuals from all $k$ of these regressions.
\hspace{.2in} 3. Regress the residuals obtained in Step 1 on the residuals obtained in Step 2 to estimate $\hat{\varepsilon}_{tY} = \beta_0 + \beta_1 \hat{\varepsilon}_{tX_k} + u_t$.
\hspace{.2in} The R-squared from this regression provides a better measure of fit when the time series exhibits extensive trending.
\medskip
{\bf Using OLS for seasonal adjustments}. The higher the frequency of an economic time series, the more likely it is to display seasonal patterns. The most common models capturing seasonal patterns include dummy variables representing the frequency with which the data were collected (usually quarter or month dummies): $Y_t = \alpha_0 + \alpha_1 S_1 + \alpha_2 S_2 + \cdots + \varepsilon_t$, where $S$ variables are your season dummy variables.
\smallskip
$\bullet$ {\bf Estimating seasonality effects}. Seasonal effects can be correlated with both your dependent and independent variables. If you include dummy variables for seasons along with the other relevant independent variables, you can simultaneously obtain better estimates of both seasonality and the effects of the other independent variables, and make more convincing arguments about the causal relationship between your independent variables and dependent variables:
\[
Y_t = \beta_0 + \sum_{i=1}^p \beta_i X_{ti} + \sum_{j=1}^q \lambda_j S_j + \varepsilon_t.
\]
\smallskip
$\bullet$ {\bf Deseasonalizing time-series data}.
\hspace{.2in} 1. Regress your dependent variable on the seasonal dummy variables to obtain the estimated function $Y_t = \hat{\alpha}_0 + \sum_{j=1}^q\hat{\alpha}_j S_j + \hat{\varepsilon}_{tY}$ and retain the residuals from this regression.
\hspace{.2in} 2. Regress each of your independent variables on the seasonal dummy variables to obtain the estimated functions $X_{tk} = \hat{\alpha}_{0k} + \sum_{j=1}^q \hat{\alpha}_{jk} S_j + \hat{\varepsilon}_{tX_k}$, where $k$ represents a specific independent variable, and retain the residuals from all $k$ of these regressions.
\hspace{.2in} 3. Regress the residuals obtained in Step 1 on the residuals obtained in Step 2 to estimate $\hat{\varepsilon}_{tY} = \beta_0 + \beta_1 \hat{\varepsilon}_{tX_k} + u_t$.
\hspace{.2in} The R-squared from this regression provides a better measure of fit when the time series exhibits considerable seasonality.
Econometricians mainly estimate the regression model with deseasonalized data to derive the explanatory power of the other independent variables. Your primary econometric results, however, should report the estimates from the model with the raw data and season dummy variables.
\section{Diving into Pooled Cross-Section Analysis}
A pooled cross section combines independent cross-sectional data that has been collected over time. Typically, pooled cross sections contain many more cross-sectional observations than the number of time periods being pooled. Consequently, the models usually resemble cross-sectional analysis with possible heteroskedasticity corrections. Because the time gap between the collection of cross-sectional units is usually large, autocorrelation and other time-series issues tend to be ignored.
Do not confuse a pooled cross section with a panel dataset. In a panel dataset the same cross-sectional units are included in each time period rather than being randomly selected in each period.
\medskip
Including dummy variables in your model for each time period, except the {\it reference period}, allows you to identify changing parameter values:
\[
Y_i = \beta_0 + \beta_1 X_{i1} + \beta_2 X_{i2} + \cdots + \delta_1 R_{i1} + \delta_2 R_{i2} + \cdots + \varepsilon_i.
\]
By examining the statistical significance of the estimated $\delta$ (or $\hat\delta$) terms, you can identify any shifts (whether up or down) in the relationship for a given period.
Adding time-period dummy variables interacted with the other independent variables allows you to identify both changing intercepts and slopes:
\[
Y_i = \beta_0 + \beta_1 X_{i1} + \beta_2 X_{i2} + \cdots + \delta_0 R_i + \delta_1 (X_1 \cdot R)_i + \delta_2 (X_2 \cdot R)_i + \cdots + \varepsilon_i.
\]
If you're interested in any distributional change that may have occurred in your population of interest between time periods, you can perform an $F$-test of joint significance for all the $\delta (\delta_0, \delta_1, \delta_2, \cdots)$ parameters. Essentially, this test identifies whether the time period has a collective influence on the intercept and/or impact of the independent variables. It's equivalent to performing a Chow test for structural stability.
\section{Panel Econometrics}
Examples of well-known panel datasets include the National Longitudinal Surveys (NLS), the Panel Study of Income Dynamics (PSID), and the Survey of Income and Program Participation (SIPP).
\medskip
{\bf Estimating the uniqueness of each individual unit}. Suppose the model that explains your outcome of interest is
\[
Y_{it} = \beta_0 + \beta_1 X_{it} + \beta_2 \omega_{it} + \varepsilon_{it}
\]
where $X$ is an observable independent variable, and $\omega$ is an unobservable independent variable.
The danger with combining panel data and OLS estimation is that you may end up with results containing {\it heterogeneity bias}. The existence of unobservable factors that consistently impact your outcome of interest ($Y$ variable) is likely with panel data, which means you need to consider using one of three estimation methods:
\checkmark First difference (FD) transformation.
\checkmark Dummy variable (DV) regression.
\checkmark The fixed effects (FE) estimator (the method most commonly used by applied econometricians).
\medskip
{\it First difference (FD) transformation}. In order to use the FD approach, we rely on a couple of assumptions. First, we assume that the values for the unobserved variable remain constant through time for a given subject, but vary across subjects; $\omega_{it} = \omega_i$ $\forall t$. Second, we assume that the model doesn't change over time. Under these two assumptions, we can take the first difference (FD) of individual observations over time: $Y_{it} = \beta_0 + \beta_1 X_{it} + \beta_2 \omega_{it} + \varepsilon_{it}$ and $Y_{it-1} = \delta_0 + \beta_1 X_{it-1} + \beta_2 \omega_{it-1} + \varepsilon_{it-1}$, and obtain
\[
\Delta Y_i = Y_{it} - Y_{it-1} = (\beta_0 - \delta_0) + \beta_1(X_{it} - X_{it-1}) + \beta_2(\omega_{it}-\omega_{it-1}) + (\varepsilon_{it} - \varepsilon_{it-1}) = \alpha_0 + \beta_1 \Delta X_i + \Delta \varepsilon_i.
\]
\medskip
{\it Dummy variable (DV) regression}. A DV model can be represented as
\[
Y_{it} = \sum_{i=1}^n \alpha_{i0} A_i + \sum_{k=1}^p \beta_kX_{it,k} + \varepsilon_{it}
\]
where $A=1$ for any observation that pertains to individual $i$ and $0$ otherwise.
\medskip
{\it Fixed effects (FE) estimator}. FE estimation is applied by {\it time demeaning} the data. Demeaning deals with unobservable factors because it takes out any component that is constant over time. By assumption, that would be the entire amount of the unobservable variable. Typically, FE models also include {\it time effect} controls. You can add them by adding dummy variables for each time period in which cross-sectional observations were obtained.
\medskip
{\bf Increasing the efficiency of estimation with random effects}. If you have panel data, your econometric model can explicitly estimate the unobserved effects associated with your cross-sectional unit using the fixed effects (FE) model:
\[
Y_{it} = \beta_0 + \beta_1 X_{it} + \beta_2 \omega_{it} + \varepsilon_{it},
\]
where $\omega_{it} = \omega_i$ are unobserved characteristics for each cross-sectional unit that don't vary over time. On the other hand, your econometric model can allow all unobserved effects to be relegated to the error term by specifying the model as
\[
Y_{it} = \beta_0 + \beta_1 X_{it} + v_{it}
\]
where $v_{it}=\omega_i + \varepsilon_{it}$. This approach is known as the {\it random effects (RE) model}.
With panel data, the advantage of the RE model over the FE model is more efficient estimates of the regression parameters. The RE technique doesn't estimate the fixed effects separately for each cross-sectional unit, so you get fewer estimated parameters, increased degrees of freedom, and smaller standard errors. A critical assumption of the RE model is that the unobserved individual effect $(\omega_i)$ isn't correlated with the independent variable(s). In addition, for the homoskedasticity assumption to hold, we must also impose a constant variance on the individual effects.
Although $\varepsilon_{it}$ satisfies the classical linear regression model (CLRM) assumptions, the inclusion of $\omega_i$ in the composite error $v_{it} = \omega_i + \varepsilon_{it}$ results in a CLRM assumption violation. If you relegate the individual effects $(\omega_i)$ to the error term, you create positive serial correlation in the composite error. As a result, RE estimation requires feasible generalized least squares (FGLS) rather than OLS to appropriately eliminate serial correlation in the error term and to produce the correct standard errors and test statistics.
\medskip
{\bf Testing efficiency against consistency with the Hausman test}. The RE model produces more efficient estimates than the FE model. However, if individual fixed effects are correlated with the independent variable(s), then the RE estimates will be biased. In that case, the FE estimates would be preferred. The Hausman test checks the RE assumptions and helps you decide between RE and FE estimation. Note if heteroskedasticity is present, the Hausman test results could be misleading.
In a model with one independent variable, the Hausman test statistic is defined as
\[
H = \frac{(\hat{\beta}_{1(FE)} - \hat{\beta}_{1(RE)})^2}{\sigma^2_{\hat{\beta}_{1(FE)}} - \sigma^2_{\hat{\beta}_{1(RE)}}} \sim \chi_1^2
\]
\part{The Part of Tens}
\section{Ten Components of a Good Econometrics Research Project}
$\bullet$ {\it Introducing Your Topic and Posing the Primary Question of Interest}.
\smallskip
$\bullet$ {\it Discussing the Relevance and Importance of Your Topic}.
\smallskip
$\bullet$ {\it Reviewing the Existing Literature}. Sources for references include
\hspace{.2in} \checkmark Google Scholar (\href{http://scholar.google.com}{scholar.google.com}) lets you search by keyword.
\hspace{.2in} \checkmark Social Science Research Network (\href{http://www.ssrn.com}{www.ssrn.com}) contains a repository of working papers with the latest research findings.
\hspace{.2in} \checkmark Economic Journals on the web (\href{http://www.oswego.edu/~economic/journals.htm}{http://www.oswego.edu/$\sim$economic/journals.htm}) provides a list of economic journals.
\hspace{.2in} \checkmark EconLit (\href{http://www.aeaweb.org/econlit/}{www.aeaweb.org/econlit/}) lists sources of economic research and is available through most electronic resources of university libraries.
\smallskip
$\bullet$ {\it Describing the Conceptual or Theoretical Framework}. One of the characteristics that differentiates applied research in econometrics from other applications of statistical analysis is a theoretical structure supporting the empirical work, rather than focus only on the statistical fit between variables.
\smallskip
$\bullet$ {\it Explaining Your Econometric Model}. You should explain and justify any specification characteristics of the econometric model (logs, quadratic functions, qualitative dependent variables, and so on) that aren't directly addressed by the conceptual framework. This can be achieved with intuition, scatter plots, and/or conventions derived by researchers in previously published work.
If there are contesting theories, then you should explain whether this implies that you could end up with different estimates of the relationship between the variables in a single model or if you should estimate more than one model.
\smallskip
$\bullet$ {\it Discussing the Estimation Method(s)}. Estimation problems arising from a failure of the CLRM assumptions are common in applied econometric research. It's usually a good idea to estimate your model using OLS to obtain baseline results, even if you ultimately decide to use a different estimation technique. You may find that the results are similar and OLS is the easiest to interpret.
\smallskip
$\bullet$ {\it Providing a Detailed Description of Your Data}.
\hspace{.2in} \checkmark How the dataset was acquired and its source(s)
\hspace{.2in} \checkmark The nature of the data (cross sectional, time series, or panel)
\hspace{.2in} \checkmark The time span covered by the data
\hspace{.2in} \checkmark How and with what frequency the data was collected
\hspace{.2in} \checkmark The number of observations present
\hspace{.2in} \checkmark Whether any observations were thrown out and why
\hspace{.2in} \checkmark Summary statistics for any variables used in your econometric model(s)
\smallskip
$\bullet$ {\it Constructing Tables and Graphs to Display Your Results}.
\smallskip
$\bullet$ {\it Interpreting the Reported Results}. Reporting your econometric results is not enough; you also need to decipher the results for your readers. The most important element is the evaluation of statistical significance and magnitude for the primary variables of interest. The discussion should include an explanation of magnitude, directionality (positive/negative effects), statistical significance, and the relationship with the research question and theoretical hypotheses posed earlier in your paper.
\smallskip
$\bullet$ {\it Summarizing What You Learned}. Synthesize your results and explain how they're connected to your primary question. Avoid
\hspace{.2in} \checkmark focusing on variables with coefficients that are statistically significant even when the magnitude of their effect on the dependent variable is negligible (nearly no effect);
\hspace{.2in} \checkmark ignoring variables with statistically insignificant coefficients---finding no relationship between variables is important when economic theory or the prevailing wisdom says otherwise.
\section{Ten Common Mistakes in Applied Econometrics}
$\bullet$ {\it Failing to Use Your Common Sense and Knowledge of Economic Theory}. One of the characteristics that differentiate applied research in econometrics from other applications of statistical analysis is the use of economic theory and common sense to motivate the connection between the independent and dependent variables.
\smallskip
$\bullet$ {\it Asking the Wrong Questions First}. Conceptual questions are more important to ask than technical ones.
\smallskip
$\bullet$ {\it Ignoring the Work and Contributions of Others}.
\smallskip
$\bullet$ {\it Failing to Familiarize Yourself with the Data}. Do some exploratory work that includes descriptive statistics, line charts (for time-series data), frequency distributions, and even listing of some individual data values. Notable issues include
\hspace{.2in} \checkmark Variables you thought were measured continuously are actually in categories or groups.
\hspace{.2in} \checkmark Measurements that you believed were real values are actually missing values.
\hspace{.2in} \checkmark Data values that appear perfectly legitimate are actually censored values.
\smallskip
$\bullet$ {\it Making It Too Complicated}. The art of econometrics lies in finding the appropriate specification or functional form to model your particular outcome of interest. Given the uncertainty of choosing the ``perfect'' specification, many applied econometricians make the mistake of overspecifying their models or favor complicated estimation methods over more straightforward techniques. If theory and common sense aren't fairly conclusive about the hypothesized effect of a variable, it's probably best to refrain from including it. Consequently, additional sophistication in your model should be introduced as necessary and not simply to exhibit your econometric skills.
\smallskip
$\bullet$ {\it Being Inflexible to Real-World Complications}. The {\it ceteris paribus} assumption often does not hold. Use proxies that seem appropriate and that others would find acceptable. Avoid forcing a particular dataset into estimation that isn't appropriate for the research question.
\smallskip
$\bullet$ {\it Looking the Other Way When You See Bizarre Results}. If some results don't pass a common-sense test, then the statistical tests are likely to be meaningless and may even indicate that you've made a mistake with your variables, the estimation technique, or both.
\smallskip
$\bullet$ {\it Obsessing over Measures of Fit and Statistical Significance}. The importance of your results shouldn't be determined on the basis of fit (R-squared values) or statistical significance alone. The primary finding in many of the best papers using econometrics involves findings of statistical insignificance.
\smallskip
$\bullet$ {\it Forgetting about Economic Significance}. The most important element in the discussion of your results is the evaluation of statistical significance {\it and} magnitude for the primary variables of interest. If a variable has a statistically significant coefficient but the magnitude is too small to be of any importance, then you should be clear about its lack of economic significance.
\smallskip
$\bullet$ {\it Assuming Your Results Are Robust}. You want to perform robustness (or sensitivity) analysis to show that your model estimates aren't sensitive (are robust) to slight variations in specification.
\part{Appendices}
\begin{appendix}
\section{Specifying Your Econometrics Regression Model}
As you define your regression model, you need to consider several elements:
$\bullet$ Economic theory, intuition, and common sense should all motivate your regression model.
$\bullet$ The most common regression estimation technique, ordinary least squares (OLS), obtains the best estimates of your model if the classical linear regression model (CLRM) assumptions hold.
$\bullet$ Assuming a normal distribution of the error term is important for hypothesis testing and prediction/forecasting.
When a regression model is estimated, prior to obtaining results, you need to provide a sound justification for the variables you've chosen.
The characteristics of the error term are of critical importance in econometrics. The assumption that the error term is normally distributed isn't required for performing OLS estimation, but it is necessary when you want to produce {\it confidence intervals} and/or perform {\it hypothesis tests} with your OLS estimates.
\section{Choosing the Functional Form of Your Regression Model}
$\bullet$ Take the time to think through specification issues methodically.
$\bullet$ Explain why you've chosen specific independent variables for your model.
$\bullet$ Justify the functional form you've chosen for the model.
$\bullet$ Test the assumptions of the classical linear regression model (CLRM) and make changes to the model as necessary.
$\bullet$ Spend some time examining the sensitivity of your results by making slight modifications to the variables and the functional form of the relationship.
\section{Working with Special Dependent Variables in Econometrics}
Like qualitative variables, the limited (censored or truncated) values cause the distributional assumptions of the classical linear regression model to fail. Fortunately, econometricians have developed techniques to handle restricted/limited dependent variables that are similar to those used for qualitative dependent variables.
The following list contains special dependent variable situations and the names of the techniques econometricians have developed to handle them:
$\bullet$ {\bf Dichotomous or binary response dependent variable:} A discrete variable with two outcomes, usually 0 or 1. Handled with {\it Probit/Logit models}.
$\bullet$ {\bf Censored dependent variable:} A continuous variable where some of the actual values have been limited to some predetermined minimum or maximum value. Handled with the {\it Tobit (censored normal) model}.
$\bullet$ {\bf Truncated dependent variable:} A continuous variable where some of the actual values aren't observed if they are less than some predetermined minimum value or more than some predetermined maximum value. Handled with the {\it truncated normal model}.
$\bullet$ {\bf Self-selected sample:} Missing values for the dependent variable due to nonrandom participation decisions from the population of interest. Handled with the {\it Heckman selection model}.
$\bullet$ {\bf Polychotomous or a multiple response dependent variable:} A discrete variable with more than two outcomes. Handled with a {\it multinomial Probit/Logit model} or {\it ordered Probit/Logit model} (covered in more advanced econometrics courses).
$\bullet$ {\bf Discrete dependent variable:} A nonnegative, discrete count variable that assumes integer values (0, 1, 2, \dots). Handled with a {\it Poisson model} or {\it negative binomial model} (covered in more advanced econometrics courses).
\section{Choosing a Forecasting Method in Econometrics}
\begin{figure}[h]
\centering
% Set the overall layout of the tree
\tikzstyle{level 1}=[level distance=2.5cm, sibling distance=3.5cm, ->]
\tikzstyle{level 2}=[level distance=4.5cm, sibling distance=2.5cm, ->]
\tikzstyle{level 3}=[level distance=4.5cm, sibling distance=3.5cm, ->]
% Define styles for bags and leafs
\tikzstyle{bag} = [text width=7em, text centered]
\tikzstyle{end} = [text width=10em, text centered]
\begin{tikzpicture}[grow=right, sloped]
\node[bag]{Forecasting}
child {
node[bag]{Qualitative}
child {
node[end]{Expert consensus \\ Scenario analysis}
}
}
child {
node[bag]{Quantitative}
child {
node[bag]{Smoothing}
child {
node[end]{Exponential \\ Autoregressive (AR) \\ Moving average (MA) \\ ARIMA}
}
}
child {
node[bag]{Causal}
child {
node[end]{Independent variables \\ Trending \\ Seasonality}
}
}
}
;
\end{tikzpicture}
\end{figure}
\section{{\it Econometrics for Dummies} Cheat Sheet}
\subsection{The CLRM assumptions}
Assumptions of the classical linear regression model (CLRM):
$\bullet$ The model parameters are linear.
$\bullet$ The values for the independent variables are derived from a random sample of the population, and they contain variability.
$\bullet$ The explanatory variables don't have perfect collinearity.
$\bullet$ The error term has zero conditional mean.
$\bullet$ The model has no heteroskedasticity.
$\bullet$ The model has no autocorrelation.
Under the above assumptions, the ordinary least squares (OLS) estimator generates the optimal results (Gauss-Markov theorem).
\subsection{Useful formulas in econometrics}
{\bf Regression coefficients in a model with one independent variable:}
\[
\hat \beta_1 = \frac{\sum_{i=1}^n (Y_i - \overline{Y})(X_i-\overline{X})}{\sum_{i=1}^n(X_i-\overline{X})^2}, \; \hat \beta_0 = \overline{Y} - \hat \beta_1 \overline{X}.
\]
{\bf Standard error of the estimate or mean squared error:}
\[
\hat \sigma_{\varepsilon} = \sqrt{\frac{\sum_{i=1}^n \hat \varepsilon_i^2}{n-p-1}}.
\]
{\bf Standard error of regression coefficients in a model with one independent variable:}
\[
\hat \sigma_{\hat \beta_1} = \frac{\hat \sigma_{\varepsilon}}{\sqrt{\sum_{i=1}^n(X_i-\overline{X})^2}}, \;
\hat \sigma_{\hat \beta_0} = \sqrt{\frac{\sum_{i=1}^n X_i^2}{n\sum_{i=1}^n(X_i-\overline{X})^2}} \cdot \hat\sigma_{\varepsilon}.
\]
{\bf Explained sum of squares (ESS), residual sum of squares (RSS), and total sum of squares (TSS):}
\[
ESS = \sum_{i=1}^n (\hat Y_i - \overline{Y})^2, \;
RSS = \sum_{i=1}^n (Y_i - \hat Y_i)^2 = \sum_{i=1}^n \hat \varepsilon_i^2, \;
TSS = \sum_{i=1}^n (Y_i - \overline{Y})^2 = ESS + RSS.
\]
{\bf Coefficient of determination; $R$-squared:}
\[
R^2 = \frac{ESS}{TSS} = 1-\frac{RSS}{TSS}.
\]
{\bf $t$-statistic for regression coefficients:}
\[
t = \frac{\hat \beta_k}{\hat\sigma_{\hat\beta_k}}.
\]
{\bf Confidence interval for regression coefficients:}
\[
\hat \beta_k \pm t_{\alpha/2} \cdot \hat \sigma_{\hat \beta_k}.
\]
\subsection{Common functional forms for regression}
{\bf Quadratic functions:}
\[ Y_i = \beta_0 + \beta_1 X_i + \beta_2 X_i^2 + \varepsilon_i.\]
{\bf Cubic functions:}
\[ Y_i = \beta_0 + \beta_1 X_i + \beta_2 X_i^2 + \beta_3 X_i^3 + \varepsilon_i. \]
{\bf Inverse functions:}
\[ Y_i = \beta_0 + \beta_1 \frac{1}{X_i} + \varepsilon_i.\]
{\bf Log-log functions:}
\[ \ln Y_i = \beta_0 + \beta_1 \ln X_i + \varepsilon_i.\]
{\bf Log-linear functions:}
\[ \ln Y_i = \beta_0 + \beta_1 X_i + \varepsilon_i.\]
{\bf Linear-log functions:}
\[ Y_i = \beta_0 + \beta_1 \ln X_i + \varepsilon_i.\]
\subsection{Typical problems estimating econometric models}
{\bf High multicollinearity}.
$\bullet$ Definition: two or more independent variables in a regression model exhibit a close linear relationship.
$\bullet$ Consequences: large standard errors and insignificant $t$-statistics, coefficient estimates sensitive to minor changes in model specification, and nonsensical coefficient signs and magnitudes.
$\bullet$ Detection: pairwise correlation coefficients and variance inflation factor (VIF).
$\bullet$ Solution: 1. collect additional data; 2. re-specify the model; 3. drop redundant variables.
\medskip
{\bf Heteroskedasticity}.
$\bullet$ Definition: the variance of the error term changes in response to a change in the value of the independent variables.
$\bullet$ Consequences: inefficient coefficient estimates, biased standard errors, and unreliable hypothesis tests.
$\bullet$ Detection: Park test, Goldfeld-Quandt test, Breusch-Pagan test, and White test.
$\bullet$ Solution: 1. weighted least squares (WLS); 2. robust standard errors.
\medskip
{\bf Autocorrelation}.
$\bullet$ Definition: an identifiable relationship exists between the values of the error in one period and the values of the error in another period.
$\bullet$ Consequences: inefficient coefficient estimates, biased standard errors, and unreliable hypothesis tests.
$\bullet$ Detection: Geary or runs test, Durbin-Watson test, and Breusch-Godfrey test.
$\bullet$ Solution: 1. Cochrane-Orcutt transformation; 2. Prais-Winsten transformation; 3. Newey-West robust standard errors.
\end{appendix}
\begin{thebibliography}{99}
\bibitem{Brooks08} Chris Brooks. {\it Introductory Econometrics for Finance}, 2nd ed. New York, Cambridge University Press, 2008.
\bibitem{JWHT14} Gareth James, Daniela Witten, Trevor Hastie, and Robert Tibshirani. {\it An introduction to statistical learning : with applications in R}. New York, Springer, 2014.
\bibitem{Pedace13a} Roberto Pedace. {\it Econometrics for dummies}. Hoboken, John Wiley \& Sons Inc., 2013.
\bibitem{Pedace13b} Roberto Pedace. {\it Econometrics for Dummies} extras. \href{http://www.dummies.com/how-to/education-languages/Economics/Econometrics/Econometrics-For-Dummies-Extras.html}{www.dummies.com/extras/econometrics}.
\end{thebibliography}
\end{document}
% Version 1.0.3, 2016-08-04: added intuition of F-test for overall significance.
% Version 1.0.2, 2016-05-01: added notes of Chapter 16, 17.
% Version 1.0.1, 2015-12-26: clarification of the error term in regression equation.
% Version 1.0, 2015-12-23: added notes of Chapter 11-15.
% Version 0.0.2, 2015-03-09: added notes of Chapter 10.
% Version 0.0.1, 2015-02-24: first draft.