A blog for learning

Introduction to Panel Threshold Model

  • Part One: Slides

slide 1

slide 2

slide 3

slide 4

slide 5

slide 6

slide 7

slide 8

slide 9

slide 10

slide 11

slide 12

slide 13

slide 14

slide 15

slide 16

slide 17

slide 18

slide 19

slide 20

slide 21

slide 22

slide 23

slide 24

slide 25

slide 26

slide 27

slide 28

slide 29

slide 30

  • Part Two: R code for this lecture
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    # First of all, generate the data for y, x and n.
    set.seed(123) # fix the seed for reproducibility
    e <- rnorm(100, mean = 0, sd = 1)
    x <- rnorm(100, mean = 0, sd = 3^2) # NOTE(review): sd = 9 here (not variance) -- confirm intent
    n <- 1:100
    y <- rep(0, times = 100)
    # True model: slope +2 up to observation 50, -2 afterwards (threshold at n = 50).
    y[1:50] <- 1 + 2 * x[1:50] + e[1:50]
    y[51:100] <- 1 - 2 * x[51:100] + e[51:100]
    data <- data.frame(n, x, y, e)
    # Plot the relationship between y and n, as well as between y and x.
    library(ggplot2)
    p1 <- ggplot(data, aes(x = n, y = y)) + geom_point()
    p2 <- ggplot(data, aes(x = x, y = y)) + geom_point()
    p1
    p2
    # Regression results.
    # Suppose we are not aware of the existence of a threshold effect.
    reg1 <- lm(y ~ x, data)
    summary(reg1)
    # If we knew the threshold, we could split the data frame into two groups
    # and run the regressions separately.
    data1 <- subset(data, n <= 50)
    data2 <- subset(data, n > 50)
    reg2 <- lm(y ~ x, data1)
    reg3 <- lm(y ~ x, data2)
    summary(reg2)
    summary(reg3)
    # If we do not know where the threshold is: grid-search every candidate
    # split point and record the residual sum of squares (RSS) of each fit.
    reg <- list()
    rss <- numeric(99) # preallocate instead of growing an array()
    for (i in 1:99) {
      dum <- x
      dum[(i + 1):100] <- 0 # dum coincides with x only up to candidate threshold i
      reg[[i]] <- lm(y ~ x + dum)
      rss[i] <- sum(residuals(reg[[i]])^2)
    }
    # BUG FIX: order(rss) printed the full ranking of all 99 candidates; the
    # estimated threshold is the single RSS-minimising index, which the later
    # sections already obtain with which.min().
    which.min(rss)
    Rss <- data.frame(n = 1:99, rss)
    ggplot(Rss, aes(x = n, y = rss)) + geom_point()
    ######################
    ######################
    # Now suppose there are two thresholds. A nested loop over both split
    # points would also work, but Hansen proposed a more elegant sequential
    # search.
    # Prepare the data.
    set.seed(456) # fix the seed for reproducibility
    e <- rnorm(100, mean = 0, sd = 1)
    x <- rnorm(100, mean = 0, sd = 3^2)
    n <- 1:100
    y <- rep(0, times = 100)
    # True model: thresholds at n = 30 and n = 60.
    y[1:30] <- 1 + 2 * x[1:30] + e[1:30]
    y[31:60] <- 1 - 2 * x[31:60] + e[31:60]
    y[61:100] <- 1 + 4 * x[61:100] + e[61:100]
    data <- data.frame(n, x, y, e)
    # Plot the relationship between y and n, as well as between y and x.
    library(ggplot2)
    p1 <- ggplot(data, aes(x = n, y = y)) + geom_point()
    p2 <- ggplot(data, aes(x = x, y = y)) + geom_point()
    p1
    p2
    # Hansen's sequential method.
    # Step 1: pinpoint the first threshold with a single-threshold search.
    reg <- list()
    rss1 <- numeric(99)
    for (i in 1:99) {
      dum <- x
      dum[(i + 1):100] <- 0
      reg[[i]] <- lm(y ~ x + dum)
      rss1[i] <- sum(residuals(reg[[i]])^2)
    }
    which.min(rss1)
    # plot the figure
    Rss1 <- data.frame(n = 1:99, rss1)
    ggplot(Rss1, aes(x = n, y = rss1)) + geom_point()
    # Step 2: pinpoint the second threshold, holding the first one fixed.
    reg2 <- list()
    rss2 <- numeric(99)
    for (i in 1:99) {
      dum1 <- x
      dum2 <- x
      left <- min(i, which.min(rss1))
      right <- max(i, which.min(rss1))
      dum1[(left + 1):100] <- 0
      dum2[1:right] <- 0
      reg2[[i]] <- lm(y ~ x + dum1 + dum2)
      rss2[i] <- sum(residuals(reg2[[i]])^2)
    }
    which.min(rss2)
    # plot the figure
    Rss2 <- data.frame(n = 1:99, rss2)
    ggplot(Rss2, aes(x = n, y = rss2)) + geom_point()
    # Step 3: re-estimate the first threshold, holding the second one fixed.
    reg3 <- list()
    rss3 <- numeric(99)
    for (i in 1:99) {
      dum1 <- x
      dum2 <- x
      left <- min(i, which.min(rss2))
      right <- max(i, which.min(rss2))
      dum1[(left + 1):100] <- 0
      dum2[1:right] <- 0
      reg3[[i]] <- lm(y ~ x + dum1 + dum2)
      rss3[i] <- sum(residuals(reg3[[i]])^2)
    }
    which.min(rss3)
    # plot the figure
    Rss3 <- data.frame(n = 1:99, rss3)
    ggplot(Rss3, aes(x = n, y = rss3)) + geom_point()
    # Put the figures above into one picture.
    all <- rbind(
      data.frame(n = 1:99, rss = rss1, t = "loop 1"),
      data.frame(n = 1:99, rss = rss2, t = "loop 2"),
      data.frame(n = 1:99, rss = rss3, t = "loop 3")
    )
    p <- ggplot(all, aes(x = n, y = rss))
    # BUG FIX: `position` is not an aesthetic, so aes(position = t) was
    # ignored; the `group` aesthetic is what makes geom_path draw one
    # separate line per loop.
    p + geom_path(aes(group = t, color = t)) + geom_point(aes(group = t, color = t))
    # Bootstrap the F test for a threshold effect; here only one threshold.
    set.seed(123) # same seed as the first section, so the same data are regenerated
    e <- rnorm(100, mean = 0, sd = 1)
    x <- rnorm(100, mean = 0, sd = 3^2)
    n <- 1:100
    y <- rep(0, times = 100)
    y[1:50] <- 1 + 2 * x[1:50] + e[1:50]
    y[51:100] <- 1 - 2 * x[51:100] + e[51:100]
    data <- data.frame(n, x, y, e)
    breg1 <- lm(y ~ x, data)
    s0 <- sum(residuals(breg1)^2) # RSS under H0 (no threshold)
    library(boot)
    # Bootstrap statistic: re-estimate the threshold on each resample and
    # return the F statistic for a threshold effect.
    # NOTE(review): s0 is taken from the original sample rather than being
    # recomputed on each resample -- confirm this matches Hansen's scheme.
    fvalue <- function(data, indices) {
      d <- data[indices, ]
      bregloop <- list()
      brss <- numeric(99)
      for (i in 1:99) {
        dum <- d$x
        dum[(i + 1):100] <- 0
        bregloop[[i]] <- lm(y ~ x + dum, data = d)
        brss[i] <- sum(residuals(bregloop[[i]])^2)
      }
      a <- which.min(brss) # estimated threshold location on this resample
      dum <- d$x
      dum[(a + 1):100] <- 0
      breg2 <- lm(y ~ x + dum, data = d)
      s1 <- sum(residuals(breg2)^2)
      (s0 / s1 - 1) * (100 - 1)
    }
    # BUG FIX: the original called boot() a second time with an extra
    # `formula1` argument; boot() forwards unknown arguments to the statistic,
    # and fvalue() has no `formula1` parameter, so that call raised an error.
    # The redundant second call has been removed.
    result <- boot(data = data, fvalue, R = 99)
    f <- result$t
    # the real f0
    dum <- data$x
    dum[51:100] <- 0
    reg4 <- lm(y ~ x + dum, data)
    rss4 <- sum(residuals(reg4)^2)
    f0 <- (s0 / rss4 - 1) * (100 - 1)
    table(f > f0) # no bootstrap f exceeds f0 => p-value = 0, reject H0
    ## Last question: confidence interval for the threshold gamma.
    ## `rss` is the RSS grid from the single-threshold search above.
    LR <- (rss - rss4) / (rss4 / (100 - 1))
    order(LR)
    # BUG FIX: the critical value was stored in `c`, shadowing base::c();
    # renamed to `crit`.
    crit <- -2 * log(1 - sqrt(1 - 0.01)) # asymptotic level alpha = 1%
    LR[LR < crit]

Python Quiz

  • If you need to read the original questions, please visit the website of Quantitative Economics .
  • Warning: This is my practice code for Quiz, I do not guarantee the accuracy of my answer

Largest palindrome product

  • A palindromic number reads the same both ways. The largest palindrome made from the product of two 2-digit numbers is 9009 = 91 × 99.
  • Find the largest palindrome made from the product of two 3-digit numbers.
1
2
3
4
5
6
7
8
9
10
11
12
13
# Largest palindrome made from the product of two 3-digit numbers.
# (Indentation restored -- the published snippet had lost all indentation.)
result = []
# BUG FIX: the problem asks for 3-digit factors, so the ranges start at 100
# (the original scanned 0..999); y starts at x to skip mirrored (y, x) pairs.
for x in range(100, 1000):
    for y in range(x, 1000):
        num = x * y
        rnum = int(str(num)[::-1])  # the product with its digits reversed
        if num == rnum:  # palindrome: reads the same both ways
            result.append((num, x, y))
sorted_result = sorted(result, key=lambda t: t[0], reverse=True)  # largest first
# BUG FIX: the original printed sorted_result[1]; after a descending sort the
# largest palindrome sits at index 0.
print(sorted_result[0])

10001st prime

  • By listing the first six prime numbers: 2, 3, 5, 7, 11, and 13, we can see that the 6th prime is 13.
  • What is the 10001st prime number?
1
2
3
4
5
6
7
8
9
10
11
# The 10001st prime number (the 6th prime is 13; we want index 10000).
# BUG FIX: the original scanned only n < 10000 (which contains just 1229
# primes), omitted the prime 2, and printed result[1000] -- it could never
# produce the 10001st prime. We now grow the list until it holds 10001 primes.
import math

result = [2]  # primes found so far, in increasing order
candidate = 3
while len(result) < 10001:
    limit = math.isqrt(candidate)  # any composite has a prime factor <= sqrt
    is_prime = True
    for p in result:
        if p > limit:
            break  # no prime divisor up to sqrt(candidate): it is prime
        if candidate % p == 0:
            is_prime = False
            break
    if is_prime:
        result.append(candidate)
    candidate += 2  # even numbers > 2 are never prime
print(result[10000])
# Note kept from the original author: a loop that appends on every
# non-divisor (instead of checking all divisors up to the square root)
# counts numbers repeatedly and gives the wrong answer; the divisor test
# must short-circuit as above.

Special Pythagorean triplet

  • A Pythagorean triplet is a set of three natural numbers, a < b < c, for which, a^2+b^2=c^2
  • There exists exactly one Pythagorean triplet for which a+b+c=1000. Find the product abc.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Special Pythagorean triplet: natural numbers a < b < c with
# a^2 + b^2 = c^2 and a + b + c = 1000; the task asks for the product abc.
# (Indentation restored -- the published snippet had lost all indentation.)

# Primary method.
# BUG FIX / performance: the original used three nested loops (O(n^3));
# since c is fully determined by c = 1000 - a - b, two loops suffice.
triplet = None  # will hold the unique (a, b, c) solution
for a in range(1, 1000):
    for b in range(a + 1, 1000):
        c = 1000 - a - b
        if c > b and a * a + b * b == c * c:
            triplet = (a, b, c)
            print(a, b, c)

# More elegant method: substitute x = c - b, so that
# b = (a^2 - x^2) / (2x) and c = (a^2 + x^2) / (2x);
# x must share a's parity for b to come out as a whole number.
for a in range(3, 1000):
    if a % 2 == 0:
        r = range(2, int(a / 2) + 2, 2)
    else:
        r = range(1, int(a / 2) + 1, 2)
    for x in r:
        b = (a * a - x * x) / (2 * x)
        c = (a * a + x * x) / (2 * x)
        if b == int(b) and a + int(b) + int(c) == 1000:
            print(a, int(b), int(c))

Largest product in a series

  • The four adjacent digits in the 1000-digit number that have the greatest product are 9 × 9 × 8 × 9 = 5832.

731671765313306249192251196744265747423553491949349698352031277450632623957831 801698480186947885184385861560789112949495459501737958331952853208805511125406 987471585238630507156932909632952274430435576689664895044524452316173185640309 871112172238311362229893423380308135336276614282806444486645238749303589072962 904915604407723907138105158593079608667017242712188399879790879227492190169972 088809377665727333001053367881220235421809751254540594752243525849077116705560 136048395864467063244157221553975369781797784617406495514929086256932197846862 248283972241375657056057490261407972968652414535100474821663704844031998900088 952434506585412275886668811642717147992444292823086346567481391912316282458617 866458359124566529476545682848912883142607690042242190226710556263211111093705 442175069416589604080719840385096245544436298123098787992724428490918884580156 166097919133875499200524063689912560717606058861164671094050775410022569831552 0005593572972571636269561882670428252483600823257530420752963450

  • Find the thirteen adjacent digits in the 1000-digit number that have the greatest product. What is the value of this product?
1
2
3
4
5
6
7
8
9
10
11
12
# Largest product of thirteen adjacent digits in the 1000-digit number.
# (Indentation restored -- the published snippet had lost all indentation.)
import math

line = "7316717653133062491922511967442657474235534919493496983520312774506326239578318016984801869478851843858615607891129494954595017379583319528532088055111254069874715852386305071569329096329522744304355766896648950445244523161731856403098711121722383113622298934233803081353362766142828064444866452387493035890729629049156044077239071381051585930796086670172427121883998797908792274921901699720888093776657273330010533678812202354218097512545405947522435258490771167055601360483958644670632441572215539753697817977846174064955149290862569321978468622482839722413756570560574902614079729686524145351004748216637048440319989000889524345065854122758866688116427171479924442928230863465674813919123162824586178664583591245665294765456828489128831426076900422421902267105562632111110937054421750694165896040807198403850962455444362981230987879927244284909188845801561660979191338754992005240636899125607176060588611646710940507754100225698315520005593572972571636269561882670428252483600823257530420752963450"
digits = [int(ch) for ch in line]  # renamed from `list`, which shadowed the builtin
result = []
# BUG FIX: a 1000-digit number has 988 windows of 13 digits (i = 0..987);
# the original range(0, 987) skipped the last window.
for i in range(len(digits) - 12):
    window = digits[i:i + 13]
    result.append((math.prod(window), *window))
sorted_result = sorted(result, key=lambda t: t[0], reverse=True)
# BUG FIX: the original printed sorted_result[1]; the maximum is at index 0.
print(sorted_result[0])

Summation of primes

  • The sum of the primes below 10 is 2 + 3 + 5 + 7 = 17.
  • Find the sum of all the primes below two million.
1
2
3
4
5
6
# Summation of all the primes below two million.
# BUG FIX / performance: the original rebuilt the full remainder list
# [n % i for i in range(2, sqrt(n) + 1)] for every n -- O(n * sqrt(n))
# overall, which (as the author notes below) never finishes for 2,000,000.
# A sieve of Eratosthenes completes in well under a second.
limit = 2000000
is_prime = bytearray([1]) * limit  # is_prime[i] == 1 while i is a prime candidate
is_prime[0] = is_prime[1] = 0  # 0 and 1 are not prime
for p in range(2, int(limit ** 0.5) + 1):
    if is_prime[p]:
        # Strike out the multiples of p, starting at p*p (smaller multiples
        # were already removed by smaller primes).
        is_prime[p * p::p] = bytearray(len(range(p * p, limit, p)))
result = sum(i for i in range(limit) if is_prime[i])
print(result)

I am pretty sure this is not the most effective method. The code is still robust on the data scale of 200000, but I cannot get the result if the data scale expands to 2000000. I am still looking for some alternative methods.

R Introduction Part 3

  • This is my lecture note written for undergraduate R programming course, which is a supplementary course of the Advanced Macroeconomics course.
  • Most figures and tables are extracted from R in Action.
  • All exercises are copied from my teacher Jing Fang‘s R Workshop, you can find the data for exercises on the website of R Workshop.

Nonlinear Regression

What is Nonlinear Regression?

In statistics, nonlinear regression is a form of regression analysis in which observational data are modeled by a function which is a nonlinear combination of the model parameters and depends on one or more independent variables.
1.Linear Regression
$$Y_i=\beta_1+\beta_2X_i+\mu_i$$
2.Nonlinear Regression
$$Y_i=\beta_1e^{\beta_2X_i}+\mu_i$$

How to cope with Nonlinear Regression?

1.Delta Method

  • Single parameter situation
    If we know the standard error of parameter $\hat{\theta}$, however, what we are really interested is the standard error of parameter $\gamma=g(\theta)$, how to estimate its standard error?
    1.If $g(\theta)$ is a linear function, you know how to get it.
    2.If $g(\theta)$ is not a linear function, then:
    We assume the standard error of $\hat{\theta}$ is $S_{\theta}$, and the standard error of $\hat{\gamma}$ is $S_{\gamma}$, the relationship between them is
    $$S_{\gamma}\equiv|g^\prime(\hat{\theta})|S_{\theta}$$
  • Multi-parameters situation
    If $\theta$ and $\gamma$ are both vectors, then how? We assume the first one is a $k$-dimension vector and the latter is a $l$-dimension vector, $l\leqq k$. $\gamma$ is a function of $\theta$: $\gamma=g(\theta)$, where $g(\theta)$ is a monotonic and continuous $l$-dimension vector function. Then we can get the covariance matrix of $\hat{\gamma}$ using the following equation:
    $$\widehat{Var}(\hat{\gamma})\equiv\hat{G}\widehat{Var}(\hat{\theta})\hat{G}^T \qquad (1)$$
    where $\widehat{Var}(\hat{\theta})$ is the estimated covariance matrix and $\hat{G}$ is the Jacobian matrix.
    Then, how to get $\widehat{Var}(\hat{\theta})$ ?
    Regardless of the form of the error covariance matrix $\Omega$, the covariance matrix of $\hat{\theta}$ equals to:
    $$\widehat{Var}(\hat{\theta})=E((\hat{\theta}-\theta_0)(\hat{\theta}-\theta_0)^T)=(X^TX)^{-1}X^T{\Omega}X(X^TX)^{-1} \qquad (2)$$
    where $X$ is the matrix of independent variables, $\Omega$ is error covariance matrix.

Example
Suppose we have the following regression:
$$Y_i=\theta_0+\frac{\alpha}{1-\alpha}X_1+\frac{1}{2}\frac{\sigma-1}{\sigma}\frac{\alpha}{(1-\alpha)^2}X_2$$
So, in this case we can get $\theta=\begin{pmatrix} \alpha \\ \sigma \end{pmatrix}$    $\gamma=\begin{pmatrix} \frac{\alpha}{1-\alpha} \\ \frac{1}{2}\frac{\sigma-1}{\sigma}\frac{\alpha}{(1-\alpha)^2} \end{pmatrix}$
$$\gamma=g(\theta) \Rightarrow \begin{cases}\gamma_1=g_1(\alpha,\sigma)\\ \gamma_2=g_2(\alpha,\sigma) \end{cases}$$
Construction of the Jacobian matrix
$$\hat{G} = \begin{bmatrix}
\frac{\partial\gamma_1}{\partial\theta_1} & \frac{\partial\gamma_1}{\partial\theta_2} \\
\frac{\partial\gamma_2}{\partial\theta_1} & \frac{\partial\gamma_2}{\partial\theta_2}
\end{bmatrix}$$
So,
$$ \hat{G} = \begin{bmatrix}
\frac{\partial(\frac{\alpha}{1-\alpha})}{\partial\alpha} & \frac{\partial(\frac{\alpha}{1-\alpha})}{\partial\sigma} \\
\frac{\partial(\frac{1}{2}\frac{\sigma-1}{\sigma}\frac{\alpha}{(1-\alpha)^2})}{\partial\alpha} & \frac{\partial(\frac{1}{2}\frac{\sigma-1}{\sigma}\frac{\alpha}{(1-\alpha)^2})}{\partial\sigma}
\end{bmatrix}$$
By conducting OLS regression, we can get $\hat{\gamma_1}$, $\hat{\gamma_2}$ and the error covariance matrix $\Omega$.
Following equation (2), we can get:
$$\widehat{Var}(\hat{\gamma})=(X^TX)^{-1}X^T{\Omega}X(X^TX)^{-1}$$
Then, according to equation (1), we can get:
$$\widehat{Var}(\hat{\theta})\equiv(\hat{G})^{-1}\widehat{Var}(\hat{\gamma})(\hat{G}^T)^{-1}$$

2. nls(.) function

  • nls is a built-in function in R, which is used to determine the nonlinear (weighted) least-squares estimates of the parameters of a nonlinear model.

  • Usage of nls:

    1
    nls(formula, data, start, control, algorithm, trace, subset, weights, na.action, model, lower, upper, ...)

  Note: You can find out more explanation on parameter settings by browsing R help documentation.

Example
Suppose we still have the following regression:
$$Y_i=\theta_0+\frac{\alpha}{1-\alpha}X_1+\frac{1}{2}\frac{\sigma-1}{\sigma}\frac{\alpha}{(1-\alpha)^2}X_2$$
We can use nls to get the estimation of parameters $\alpha$, $\sigma$ directly.

1
2
3
#In this method, we must give R approximate initial values of alpha and sigma,
#from which nls() starts its iterative search.
#BUG FIX: the original assigned the fit to `lm`, shadowing the base lm()
#function; renamed to `fit`. Also use FALSE rather than the reassignable F.
fit=nls(y~constant+alpha/(1-alpha)*x1+1/2*(sigma-1)/sigma*alpha/(1-alpha)^2*x2,start=list(constant=5,alpha=0.3,sigma=1.5),trace=FALSE)
summary(fit)

Notice on paper replication

  • You need to replicate the paper The Solow Model with CES Technology: Nonlinearities and Parameter Heterogeneity as the final assignment in this course. Related data can be download from here.
  • You only need to replicate the Table 1, Table 2 and Table 4.
  • You need to use Delta method to estimate the standard errors of parameters of Restricted Basic Solow-CD Model, Restricted Extended Solow-CD Model and Restricted Basic Solow-CES Model in Table 1 and Table 2.
  • You need to use dur_john.Rdata for Table 1. And you need to delete those following rows of data to get the right answer: 13, 14, 6, 19, 36, 44, 45, 50, 51, 56, 59, 62, 66, 68, 69, 72, 78, 81, 82, 91, 111, 114, 118.
  • You need to use datamp2.Rdata to replicate Table 2.
  • You need to use datamp1.Rdata to replicate Table 4.
  • Data description in the original file are not totally correct. Please see the following data description, which I have amended.

Data description

Winford H. Masanjala and Chris Papageorgiou, “The Solow Model with CES
Technology: Nonlinearities and Parameter Heterogeneity”, Journal of Applied
Econometrics, Vol. 19, No. 2, 2004, pp. 171-201.
 

Documentation for data in dur_john.Rdata

CODE=Country number in Summers-Heston dataset.
NONOIL=1 for nonoil producing countries.
INTER=1 for countries with better quality data.
OECD=1 for OECD countries.
GDP60=Per capita GDP in 1960.
GDP85=Per capita GDP in 1985.
GDPGRO=Average growth rate of per capita GDP (1960-1985).
POPGRO=Average growth rate of working-age population (1960-1985).
IONY=Average ratio of investment (including Government Investment) to GDP(1960-1985).
SCHOOL=Average fraction of working-age population enrolled in secondary school (1960-1985).
LIT60=fraction of the population over 15 years old that is able to read and write in 1960.
NA indicates that the observation is missing. This dataset has also being used in Durlauf and Johnson (JAE 1995).
 
There are 121 observations for each variable. All of the data with the exception of LIT60 are from Mankiw, Romer and Weil (QJE 1992), who in turn constructed the data from Penn World Tables 4.0. LIT60 is from the World Bank’s World Development Report.
 

Documentation for data in datamp1.txt

CODE=Country number in Summers-Heston dataset.
GDP60=Per capita GDP in 1960.
GDP85=Per capita GDP in 1985.
POPGRO=Average growth rate of working-age population (1960-1985).
IONY=Average ratio of investment to GDP (1960-1985).
SCHOOL=Average fraction of working-age population enrolled in secondary school (1960-1985).
LIT60=fraction of the population over 15 years old that is able to read and write in 1960.
 
There are 96 observations for each variable. All of the data with the exception of LIT60 are from Mankiw, Romer and Weil (QJE 1992) who in turn constructed the data from Penn World Tables 4.0. LIT60 is from the World Bank’s World Development Report.
 

Documentation for data in datamp2.txt

CODE=Country number in Summers-Heston dataset.
GDP60=Per capita GDP in 1960.
GDP85=Per capita GDP in 1985.
IONY=Average ratio of investment to GDP (1960-1995).
SCHOOL=Average fraction of working-age population enrolled in secondary school (1960-1995).
POPGRO=Average growth rate of working-age population (1960-1995).
 
There are 90 observations for each variable. All of the data are from Bernanke and Gurkaynak (NBER Macroeconomics Annual 2001) who constructed the data from Penn World Tables 6.0.

R Introduction Part 2

  • This is my lecture note written for undergraduate R programming course, which is a supplementary course of the Advanced Macroeconomics course.
  • Most figures and tables are extracted from R in Action.
  • All exercises are copied from my teacher Jing Fang‘s R Workshop, you can find the data for exercises on the website of R Workshop.

Regression with R

As you can see in the following table, the term regression can be confusing because there are so many specialized varieties. In this class we will only focus on OLS, Nonparametric Regression and Robust Regression.
Varieties of Regression Analysis

OLS regression

$$\hat{Y_i}=\hat{\beta_0}+\hat{\beta_1}X_{1,i}+…+\hat{\beta_k}X_{k,i} \qquad i=1…n$$
where $n$ is the number of observations and $k$ is the number of predictor variables. In this equation:
$\hat{Y_i}$: is the predicted value of the dependent variable for observation $i$ (specifically, it is the estimated mean of the $Y$ distribution, conditional on the set of predictor values).
$X_{j,i}$: is the $j^{th}$ predictor value for the $i^{th}$ observation.
$\hat{\beta_0}$: is the intercept (the predicted value of $Y$ when all the predictor variables equal 0).
$\hat{\beta_j}$: is the regression coefficient for the $j^{th}$ predictor (slope representing the change in $Y$ for a unit change in $X_j$).

To properly interpret the coefficients of the OLS model, you must satisfy a number of statistical assumptions:

  • Normality—For fixed values of the independent variables, the dependent
    variable is normally distributed.
  • Independence—The $Y_i$ values are independent of each other.
  • Linearity—The dependent variable is linearly related to the independent
    variables.
  • Homoscedasticity—The variance of the dependent variable doesn’t vary with the
    levels of the independent variables.

lm(.) function

  • OLS regression can be conducted by using lm function.
  • Usage of lm:
    1
    lm(formula, data, subset, weights, na.action, method = "qr", model = TRUE, x = FALSE, y = FALSE, qr= TRUE, singular.ok= TRUE, contrasts = NULL, offset, ...)

  Note: You can find out more explanation on parameter settings by browsing R help documentation.

  • An example:
    Suppose you need to conduct OLS regression on $y=c+x_1+x_2+…+x_n$, data have been stored in a data frame named mydata.
    You type the following command in RStudio to tell R to conduct OLS regression:
    1
    2
    3
    4
    5
    reg=lm(y~x1+x2...+xn, mydata)
    Then you use the following command to tell R to report the result:
    summary(reg)

There are some symbols listed in the following table, which are commonly used in regression.
Symbols Commonly Used in Regression

Exercises

Exercises1

Regression diagnostics

  • Test whether the coefficients in regression model satisfy certain constrains.

    1
    2
    library(car)
    lht(reg, "the constrains you want to test")
  • Test whether some variables in your regression model are combined significant.
    1.Use anova function

    1
    2
    3
    4
    5
    6
    anova(reg1, reg2)
    2.Use the `waldtest` function
    library(lmtest)
    waldtest(reg1,reg2,vcov=vcovHC(reg2,type="HC0"))

  Q: What is vcovHC? (Hint: Please find the answer in R help documentation or Google.)

Exercises

Exercises2

Heteroskedasticity

  • What is heteroskedasticity?
    One of our assumptions for linear regression is that the random error terms in our regression have same variance. However, there are times that regression ends with heteroskedasticity, which means the random error terms have different variances.

  • What may happen if heteroskedasticity does exist in our regression?
    In this situation, the t-test and F-test may become unreliable and misleading: coefficients that appear significant may not actually be significant.

  • How to discern heteroskedasticity?

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    # 1.Breusch-Pagan test
    library(lmtest) # BUG FIX: bptest() is provided by lmtest, which was never loaded
    reg=lm(y~x1+x2,mydata)
    bptest(reg)
    # 2.White test
    # One solution: a Breusch-Pagan test on squares and cross-products.
    # BUG FIX: `data` must be passed by name; positionally, mydata landed in
    # bptest()'s `studentize` argument.
    bptest(reg, ~x1+x2+I(x1^2)+I(x2^2)+x1:x2, data=mydata)
    # This is another one.
    # NOTE(review): the bstats package has been archived from CRAN -- confirm
    # it is still installable before recommending it.
    library(bstats)
    white.test(reg)
  • How to get robust standard error when heteroskedasticity appears?

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    # The first method (recommended)
    library(sandwich)
    library(lmtest)
    # BUG FIX: the original wrapped HC0 in curly "smart" quotes, which R
    # cannot parse; plain ASCII quotes are required.
    coeftest(reg, vcov=vcovHC(reg,"HC0"))
    # The second method
    library(car)
    sqrt(diag(vcovHC(reg1,type="HC0")))
    # The third method
    # BUG FIX: `type` is an argument of hccm(), not of diag(), and the
    # original line had an unbalanced extra closing parenthesis.
    library(car)
    sqrt(diag(hccm(reg1,type="hc0")))

Exercises

Exercises3

R Introduction Part 1

  • This is my lecture note written for undergraduate R programming course, which is a supplementary course of the Advanced Macroeconomics course.
  • Most figures and tables are extracted from R in Action.
  • All exercises are copied from my teacher Jing Fang‘s R Workshop, you can find the data for exercises on the website of R Workshop.

What is R ?

R was created by Ross Ihaka and Robert Gentleman at the University of Auckland, New Zealand, and is currently developed by the R Development Core Team, of which Chambers is a member. The R language is widely used among statisticians and data miners for developing statistical software and data analysis. For more information, please see Wiki: R(programming language))

Why choose R?

  1. R is a GNU project and totally free for everyone.
  2. R runs on a wide array of platforms, like Windows, Unix and Mac OS. You can even run R programming on your iPhone or iPad!
  3. R can easily import data from various types of data sources, including text files, database management systems, statistical packages, and specialized data repositories. It can write data out to these systems as well.
  4. R contains advanced statistical routines not yet available in other packages. In fact, new methods become available for download on a weekly basis.
  5. R has state-of-the-art graphics capabilities (try the package: ggplot2).
  6. R has a global community of more than 2 million users and developers who voluntarily contribute their time and technical expertise to maintain, support and extend the R language and its environment, tools and infrastructure.

Comparing R with other mainstream software

  • Stata: Widely used in Economics field.
  • SAS: excels at coping with big data, the syntax is awkward.
  • Matlab: has strong ability of numerical value calculation and is incredible large.
  • SPSS: widely used in Business Management.
  • Eviews: is good at Time Series Analysis.

Most important: They are all commercial software, which means they are not free to individual user.

Installation of R

  • How to get R?
    R can be downloaded from CRAN
    R Download Page
  • RStudio
    RStudio is an amazing IDE for R. I strongly recommend you to run R program on RStudio.
    RStudio can be downloaded from the RStudio website
    RStudio Download Page

Packages of R

One feature of R is that most functions can be achieved by the installation of different packages. There are 6523 packages on the website of CRAN and the number is still growing.

  • Install packages
    1.Install packages manually
    Download package from This Page, RStudio->Tools->Install Packages
    2.Install packages by using command install.packages()
    (1) Use command install.packages(), then choose the package you need.
    (2) Use command install.packages("lmtest"), R will install the “lmtest” package and those supporting packages automatically.
  1. Load packages by using command library()
    Use library(lmtest) to load the “lmtest” package before you want to use this package.
    Note: You only need to install packages once, but you need to reload those needed packages every time after you restart RStudio.
  2. Update packages
    Use command update.packages() to update those packages you have installed.

Other notes on using R

  • R is a case sensitive language.
  • = and <- can both be used as assignment symbol.
    Personally, I do not think there is a big difference between them and I highly recommend using = instead of <-. For more information on the subtle difference between these two symbol, please see stackoverflow and 知乎.
  • We use # in R to indicate the following part is comment and should not be compiled.
  • R cannot read setwd("c:\myprogram"), you should use ‘setwd(“c:/myprogram”)’ or setwd("c:\\myprogram").
  • Please wisely use help() or ?the_name_of_function to read help documentation. The website Rseeker and Google are also really helpful.

Create dataset

A dataset is usually a rectangular array of data with rows representing observations
and columns representing variables. The following table provides an example of a hypothetical
patient dataset.
A Patient Dataset

  • How to construct a dataset?
    1.Import or type data into the data structures.
    2.Choose a type of data structures to store data.

  • Data input
    From the following figure, we can see that R can cope with different data formats.
    Different Data Types
    read.table() #This can be used to read txt file, the argument na.strings="." should be used to convert the missing value (“.” in txt file) to NA value.
    read.csv() #This can be used to read CSV file, highly recommended.
    load() #This can be used to load Rdata file.
    Note: You can find out more information on how to input other different types of data in the book R in Action (R语言实战).

  • Data structures
    R has a wide variety of objects for holding data, including scalars, vectors, matrices,
    arrays, data frames, and lists. They differ in terms of the type of data they can hold,
    how they are created, their structural complexity, and the notation used to identify and
    access individual elements. The figure shows a diagram of these data structures.
    Different Data Structures

1.Vectors
Vectors are one-dimensional arrays that can hold numeric data, character data, or logical
data. The combine function c() is used to form the vector.

1
2
3
4
5
6
7
8
9
a=c(1,2,3,4) #This is a numeric vector.
b=c("one","two","three") #This is a character vector.
c=c(TRUE,FALSE,TRUE) #This is a logical vector.
2.Matrices
A matrix is a two-dimensional array where each element has the same mode (numeric,
character, or logical). Matrices are created with the `matrix` function.
x=matrix(1:20, nrow=5, ncol=4) #This creates a 5*4 matrix using the numbers 1~20.

Common matrix operations:
Dimensions of matrix x: dim(x)
Rows of matrix x: nrow(x)
Columns of matrix x: ncol(x)
Transpose of matrix x: t(x)
The value of the determinant of matrix x: det(x)
If the determinant is not 0, then we can get the inverse of matrix x: solve(x)
Eigenvalue and eigenvector: y=eigen(x), then y$val is eigenvalue, y$vec is eigenvector.
Multiplication of matrices: a %*% b
Arithmetic operations and power operation on every element of matrix: + - * / ^

3.Data frames
Personally, I think data frame is the most important and common data structure you will deal with in R. A data frame is more general than a matrix in that different columns can contain different modes of data (numeric, character, etc.).

1
2
3
4
5
6
patientID= c(1, 2, 3, 4)
age = c(25, 34, 28, 52)
diabetes = c("Type1", "Type2", "Type1", "Type1")
status = c("Poor", "Improved", "Excellent", "Poor")
patientdata= data.frame(patientID, age, diabetes, status)
patientdata #patientdata is a data frame.

The different ways to read data from data frame:

1
2
3
patientdata[1:2]
patientdata[c("diabetes", "status")]
patientdata$age

Useful mathematical functions and statistical functions

Mathematical Functions
Statistical Functions

Useful data operation functions

1.summary
summary is a generic function used to produce result summaries of the results of various model fitting functions.
summary(data$example) or summary(data)

2.sum
sum returns the sum of all the values present in its arguments.
sum(data$example, na.rm=TRUE)

3.nrow and ncol
nrow and ncol return the number of rows or columns present in x.

4.cbind and rbind
Take a sequence of vector, matrix or data-frame arguments and combine by columns or rows, respectively.

5.replace
replace replaces the values in x with indices given in list by those given in values.
data$V1=replace(data$V1,data$V1<5,0)
In this example, if the data in column V1 is less than 5, then those data will be converted to 0. This function is really useful when you want to convert your numerical variable to dummy variable.

6.subset
Return subsets of vectors, matrices or data frames which meet conditions.
subset(data, subset, select)

7.aggregate
Splits the data into subsets, computes summary statistics for each, and returns the result in a convenient form.
aggregate(data, by=list(data$variable), mean)
In this example, based on the column variable, R splits the data into subsets and computes mean of those subsets.

Reference books

Reference websites

Reference online courses