Code: Post-estimation predictions with FD and FE regressions in R
Post-estimation predictions with panel data. Building alternative outcomes with different policy scenarios.
Data download
The example data is based on the publication article:
Cisneros, E., Zhou, S. L., and Börner, J. (2015): Naming and Shaming for Conservation: Evidence from the Brazilian Amazon, PLoS ONE, 10(9), 1 - 24, https://doi.org/10.1371/journal.pone.0136402
The paper analyses the effect of the blacklisting policy in the Brazilian Amazon on deforestation rates. Here we do not replicate the results of the paper, but produce a minimal dataset to demonstrate estimate the effect of the policy and predict the potential deforestation outcomes under two scenarios:
1) What would have happened without the policy, i.e. how much more deforestation would have occurred. 2) How much more deforestation would have occurred if the effect of the policy was 1.64 standard deviations below the average effect size.
library(dplyr)
library(lfe)
#download data
t<-tempfile()
tt<-tempdir()
download.file("https://doi.org/10.1371/journal.pone.0136402.s001",destfile=t)
unzip(zipfile=t,files="blacklist_data_20150623.csv",exdir=tt)
#read
d<-read.csv(file.path(tt,"blacklist_data_20150623.csv"))
#new variables
d$id<-d$municip_geocode_id
d$treat<-I(d$emb_cont_share)*1
#minimal data set
vlist<-c(
"id","year","def_area","treat","gdppc_l1","nibamafines_no_l1"
)
d<-d[,vlist]
#logarithmic transformations (inv. hyp. sine)
for (v in c("def_area","treat","gdppc_l1","nibamafines_no_l1")){
d[,paste0("asinh.",v)]<-asinh(d[,v])
}
#create first differences
a<-lapply(
split(d,d$id),
function(s){
# s<-split(d,d$id)[[1]]
n<-c("def_area","treat","gdppc_l1","nibamafines_no_l1")
n<-c(n,paste0("asinh.",n))
for (v in n){
# print(v)
s[,paste0("d.",v)]<-s[,v]-dplyr::lag(s[,v])
}
return(s)
}
)
d<-do.call("rbind",a)
#for simplicity - set treatment to 1 for semi-treated observations
table(d$treat)
##
## 0 0.166666666666667 0.333333333333333 0.5
## 5190 7 7 36
## 1
## 172
i<-which(d$treat>0)
d[i,"treat"]<-1
#ordering is crucial
d<-d[order(d$id,d$year),]
#save
d.base<-d
# names(d)
The panel data set comprises 492 originally forested municipalities
(id
) between the years 2002 and 2012. The treatment indicator,
treat
, turns one when a municipality was set on a deforestation
blacklist. def_area
is the yearly measure of deforestation at the
municipality level in square meters. gdppc_l1
denotes a the one-year
lagged per capita GDP and nibamafines_no_l1
are the lagged number of
fines issued by the environmental enforcement agency IBAMA.
summary(d[,c("id","def_area","treat","gdppc_l1","nibamafines_no_l1")])
## id def_area treat gdppc_l1
## Min. :1100015 Min. :0.000e+00 Min. :0.00000 Min. : 1314
## 1st Qu.:1303945 1st Qu.:1.567e+06 1st Qu.:0.00000 1st Qu.: 3935
## Median :1506252 Median :6.180e+06 Median :0.00000 Median : 6448
## Mean :2158849 Mean :2.830e+07 Mean :0.04102 Mean : 9318
## 3rd Qu.:2105456 3rd Qu.:2.504e+07 3rd Qu.:0.00000 3rd Qu.: 11273
## Max. :5108956 Max. :1.308e+09 Max. :1.00000 Max. :158973
## nibamafines_no_l1
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 5.00
## Mean : 17.63
## 3rd Qu.: 16.00
## Max. :2218.00
Fixed effect (FE) estimation
Estimation
f<-"asinh.def_area ~ treat + asinh.gdppc_l1 + asinh.nibamafines_no_l1 | id + year | 0 | id"
f<-as.formula(f)
e<-felm(f,d)
summary(e)
##
## Call:
## felm(formula = f, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.6536 -0.5328 0.0494 0.5912 15.4182
##
## Coefficients:
## Estimate Cluster s.e. t value Pr(>|t|)
## treat -0.48951 0.15467 -3.165 0.00165 **
## asinh.gdppc_l1 0.40076 0.17006 2.357 0.01884 *
## asinh.nibamafines_no_l1 -0.01169 0.03043 -0.384 0.70100
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.041 on 4907 degrees of freedom
## Multiple R-squared(full model): 0.687 Adjusted R-squared: 0.6549
## Multiple R-squared(proj model): 0.002349 Adjusted R-squared: -0.1001
## F-statistic(full model, *iid*):21.37 on 504 and 4907 DF, p-value: < 2.2e-16
## F-statistic(proj model): 5.36 on 3 and 491 DF, p-value: 0.001224
e.base<-e
Add fixed effects to the data frame
d<-d.base
e<-e.base
fe<-getfe(e)
table(fe$fe) # two types of fixed effects
##
## id year
## 492 11
# split up fixed effects in two objects
fe.year<-fe[fe$fe=="year",]
fe.id<-fe[fe$fe=="id",]
names(fe.year)[1]<-"fe.year"
names(fe.id)[1]<-"fe.id"
# add FE effects ot data frame
d<-merge(d,fe.id[,c("idx","fe.id")] ,by.x="id",by.y="idx",all.x=T)
d<-merge(d,fe.year[,c("idx","fe.year")],by.x="year",by.y="idx",all.x=T)
d<-d[order(d$id,d$year),]
d.fe<-d
Add fitted values to the data frame
d<-d.fe
e<-e.base
d$p1 <- c(e$fitted.values)
d.fit<-d
Check if prediction works
m<-d[,c("treat","asinh.gdppc_l1","asinh.nibamafines_no_l1")]
x<- e$fitted.values - c(as.matrix(m) %*% e$coefficients + d[,"fe.year"] +d[,"fe.id"] )
table(round(x,9))
##
## -2e-09 -1e-09 0 1e-09 2e-09
## 1 306 4815 268 22
table(round(x,8)) #have to be all near zero
##
## 0
## 5412
Prediction with manipulated data set
d<-d.fe
e<-e.base
m<-d[,c("treat","asinh.gdppc_l1","asinh.nibamafines_no_l1")]
# manipulate data
mm<-m
table(m$treat)
##
## 0 1
## 5190 222
i<-which(mm$treat>0)
mm[i,"treat"]<-0
# predicted outcomes
x<-c(as.matrix(m) %*% e$coefficients + d[,"fe.year"] +d[,"fe.id"] ) #fitted
xx<-c(as.matrix(mm) %*% e$coefficients + d[,"fe.year"] +d[,"fe.id"] ) #manipulated
# marginal impact
mean(xx[i]) - mean(x[i])
## [1] 0.4895143
coefficients(e)["treat"] # similar ?
## treat
## -0.4895143
# total impact
(sum(sinh(xx[i]))-sum(sinh(x[i])))/1000^2
## [1] 9572.927
(sum(sinh(xx))-sum(sinh(x)))/1000^2
## [1] 9572.927
Prediction with manipulated estimates
d<-d.fe
e<-e.base
m<-d[,c("treat","asinh.gdppc_l1","asinh.nibamafines_no_l1")]
#manipulate estimate
# decrease effect size by 1.67 SD to the lower bound of the 90% conf. interval
est<-coefficients(e)
est["treat"]<-est["treat"] + 1.645 * sqrt(diag(vcov(e)))["treat"]
#predict outcomes
x<-c(as.matrix(m) %*% e$coefficients + d[,"fe.year"] +d[,"fe.id"] ) #fitted
xx<-c(as.matrix(m) %*% est + d[,"fe.year"] +d[,"fe.id"] )
# marginal impact
mean(xx[i]) - mean(x[i])
## [1] 0.2544269
# total impact
(sum(sinh(xx[i]))-sum(sinh(x[i])))/1000^2
## [1] 4391.744
Out of sample prediction
Note: Predictions after FE estimations necessarily need fixed effect
values per unit - here: d[,"fe.id"]
. That means, that it is not
possible to predict out of sample into new observations. One would have
to assume some unit-FE (or try to predict such unit-FE) or switch to a
first difference estimation, which only uses year-fixed-effects.
First-Difference FD estimation
Estimation
f<-"d.asinh.def_area ~ d.treat + d.asinh.gdppc_l1 + d.asinh.nibamafines_no_l1 | year | 0 | id"
f<-as.formula(f)
e<-felm(f,d)
summary(e)
##
## Call:
## felm(formula = f, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.1858 -0.4537 0.0817 0.5228 19.4110
##
## Coefficients:
## Estimate Cluster s.e. t value Pr(>|t|)
## d.treat -0.86766 0.19375 -4.478 7.7e-06 ***
## d.asinh.gdppc_l1 -0.01062 0.13746 -0.077 0.938
## d.asinh.nibamafines_no_l1 0.03621 0.02182 1.660 0.097 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.947 on 4907 degrees of freedom
## (492 observations deleted due to missingness)
## Multiple R-squared(full model): 0.05821 Adjusted R-squared: 0.05591
## Multiple R-squared(proj model): 0.001532 Adjusted R-squared: -0.0009098
## F-statistic(full model, *iid*):25.28 on 12 and 4907 DF, p-value: < 2.2e-16
## F-statistic(proj model): 7.567 on 3 and 491 DF, p-value: 5.886e-05
e.base<-e
Add fixed effects to the data frame
d<-d.base
e<-e.base
fe<-getfe(e)
table(fe$fe) # here only year fixed effects
##
## year
## 10
fe.year<-fe[fe$fe=="year",]
names(fe.year)[1]<-"fe.year"
# add FE effects ot data frame
d<-merge(d,fe.year[,c("idx","fe.year")],by.x="year",by.y="idx",all.x=T)
d<-d[order(d$id,d$year),]
d.fe<-d
Add fitted values to the data frame
d<-d.fe
e<-e.base
z<-data.frame(
id = e$clustervar$id,
year = e$fe$year,
p1 = c(e$fitted.values)
)
d<-merge(d,z,by=c("id","year"),all.x=T)
# d[d$id==1100015,]
# prediction in level
a<-lapply(
split(d,d$id),
function(s){
# s<-split(d,d$id)[[1]]
s[2,"x"]<-s[1,"asinh.def_area"]+s[2,"p1"]
for (i in 3:nrow(s)){
s[i,"x"]<-s[(i-1),"x"]+s[i,"p1"]
}
s$p1l<-sinh(s$x)
s$x<-NULL
return(s)
}
)
d<-do.call("rbind",a)
d.fit<-d
Check if prediction works
d<-d.fit
e<-e.base
x<-d[,c("d.treat","d.asinh.gdppc_l1","d.asinh.nibamafines_no_l1")]
d$p2 <- c(as.matrix(x) %*% e$coefficients + d[,"fe.year"] )
table(round(d$p1-d$p2,12),useNA="always")
##
## 0 <NA>
## 4920 492
Prediction with manipulated data set
d<-d.fit
e<-e.base
# set treatment to zero
d$treat<-0
d$d.treat<-0
x<-d[,c("d.treat","d.asinh.gdppc_l1","d.asinh.nibamafines_no_l1")]
d$p2 <- c(as.matrix(x) %*% e$coefficients + d[,"fe.year"] )
# prediction in level
a<-lapply(
split(d,d$id),
function(s){
# s<-split(d,d$id)[[1]]
s[2,"x"]<-s[1,"asinh.def_area"]+s[2,"p2"]
for (i in 3:nrow(s)){
s[i,"x"]<-s[(i-1),"x"]+s[i,"p2"]
}
s$p2l<-sinh(s$x)
s$x<-NULL
return(s)
}
)
d<-do.call("rbind",a)
#impact in level
(sum(d$p1l,na.rm=T)-sum(d$p2l,na.rm=T))/1000^2
## [1] -9512.9
Prediction with manipulated estimates
d<-d.fit
e<-e.base
# reduce effect size by 1.67 to the lower bound of the 90% conf. interval
est<-coefficients(e)
est["d.treat"]<-est["d.treat"] + 1.645 * sqrt(diag(vcov(e)))["d.treat"]
x<-d[,c("d.treat","d.asinh.gdppc_l1","d.asinh.nibamafines_no_l1")]
d$p2 <- c(as.matrix(x) %*% est + d[,"fe.year"] )
# prediction in level
a<-lapply(
split(d,d$id),
function(s){
# s<-split(d,d$id)[[1]]
s[2,"x"]<-s[1,"asinh.def_area"]+s[2,"p2"]
for (i in 3:nrow(s)){
s[i,"x"]<-s[(i-1),"x"]+s[i,"p2"]
}
s$p2l<-sinh(s$x)
s$x<-NULL
return(s)
}
)
d<-do.call("rbind",a)
#impact in level
(sum(d$p1l,na.rm=T)-sum(d$p2l,na.rm=T))/1000^2
## [1] -2718.87
Out of sample prediction
Note: Predictions after FD estimations only need year fixed effects. That means, that it is possible to predict out of sample into new observations.