Code: Post-estimation predictions with FD and FE regressions in R

Post-estimation predictions with panel data. Building alternative outcomes with different policy scenarios.

Data download

The example data is based on the publication article:

Cisneros, E., Zhou, S. L., and Börner, J. (2015): Naming and Shaming for Conservation: Evidence from the Brazilian Amazon, PLoS ONE, 10(9), 1 - 24, https://doi.org/10.1371/journal.pone.0136402

The paper analyses the effect of the blacklisting policy in the Brazilian Amazon on deforestation rates. Here we do not replicate the results of the paper, but produce a minimal dataset to demonstrate estimate the effect of the policy and predict the potential deforestation outcomes under two scenarios:

1) What would have happened without the policy, i.e. how much more deforestation would have occurred. 2) How much more deforestation would have occurred if the effect of the policy was 1.64 standard deviations below the average effect size.

library(dplyr)
library(lfe)

#download data
t<-tempfile()
tt<-tempdir()
download.file("https://doi.org/10.1371/journal.pone.0136402.s001",destfile=t)
unzip(zipfile=t,files="blacklist_data_20150623.csv",exdir=tt)
#read
d<-read.csv(file.path(tt,"blacklist_data_20150623.csv"))
#new variables
d$id<-d$municip_geocode_id
d$treat<-I(d$emb_cont_share)*1
#minimal data set
vlist<-c(
  "id","year","def_area","treat","gdppc_l1","nibamafines_no_l1"
)
d<-d[,vlist]
#logarithmic transformations (inv. hyp. sine)
for (v in c("def_area","treat","gdppc_l1","nibamafines_no_l1")){
  d[,paste0("asinh.",v)]<-asinh(d[,v])
}
#create first differences
a<-lapply(
  split(d,d$id),
  function(s){
    # s<-split(d,d$id)[[1]]
    n<-c("def_area","treat","gdppc_l1","nibamafines_no_l1")
    n<-c(n,paste0("asinh.",n))
    for (v in n){
      # print(v)
      s[,paste0("d.",v)]<-s[,v]-dplyr::lag(s[,v])
    }
    return(s)
  }
)
d<-do.call("rbind",a)
#for simplicity - set treatment to 1 for semi-treated observations
table(d$treat)
##
##                 0 0.166666666666667 0.333333333333333               0.5
##              5190                 7                 7                36
##                 1
##               172
i<-which(d$treat>0)
d[i,"treat"]<-1
#ordering is crucial
d<-d[order(d$id,d$year),]
#save
d.base<-d
# names(d)

The panel data set comprises 492 originally forested municipalities (id) between the years 2002 and 2012. The treatment indicator, treat, turns one when a municipality was set on a deforestation blacklist. def_area is the yearly measure of deforestation at the municipality level in square meters. gdppc_l1 denotes a the one-year lagged per capita GDP and nibamafines_no_l1 are the lagged number of fines issued by the environmental enforcement agency IBAMA.

summary(d[,c("id","def_area","treat","gdppc_l1","nibamafines_no_l1")])
##        id             def_area             treat            gdppc_l1
##  Min.   :1100015   Min.   :0.000e+00   Min.   :0.00000   Min.   :  1314
##  1st Qu.:1303945   1st Qu.:1.567e+06   1st Qu.:0.00000   1st Qu.:  3935
##  Median :1506252   Median :6.180e+06   Median :0.00000   Median :  6448
##  Mean   :2158849   Mean   :2.830e+07   Mean   :0.04102   Mean   :  9318
##  3rd Qu.:2105456   3rd Qu.:2.504e+07   3rd Qu.:0.00000   3rd Qu.: 11273
##  Max.   :5108956   Max.   :1.308e+09   Max.   :1.00000   Max.   :158973
##  nibamafines_no_l1
##  Min.   :   0.00
##  1st Qu.:   0.00
##  Median :   5.00
##  Mean   :  17.63
##  3rd Qu.:  16.00
##  Max.   :2218.00

Fixed effect (FE) estimation

Estimation

f<-"asinh.def_area ~ treat + asinh.gdppc_l1 + asinh.nibamafines_no_l1 | id + year | 0 | id"
f<-as.formula(f)
e<-felm(f,d)
summary(e)
##
## Call:
##    felm(formula = f, data = d)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -12.6536  -0.5328   0.0494   0.5912  15.4182
##
## Coefficients:
##                         Estimate Cluster s.e. t value Pr(>|t|)
## treat                   -0.48951      0.15467  -3.165  0.00165 **
## asinh.gdppc_l1           0.40076      0.17006   2.357  0.01884 *
## asinh.nibamafines_no_l1 -0.01169      0.03043  -0.384  0.70100
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.041 on 4907 degrees of freedom
## Multiple R-squared(full model): 0.687   Adjusted R-squared: 0.6549
## Multiple R-squared(proj model): 0.002349   Adjusted R-squared: -0.1001
## F-statistic(full model, *iid*):21.37 on 504 and 4907 DF, p-value: < 2.2e-16
## F-statistic(proj model):  5.36 on 3 and 491 DF, p-value: 0.001224
e.base<-e

Add fixed effects to the data frame

d<-d.base
e<-e.base
fe<-getfe(e)
table(fe$fe) # two types of fixed effects
##
##   id year
##  492   11
# split up fixed effects in two objects
fe.year<-fe[fe$fe=="year",]
fe.id<-fe[fe$fe=="id",]
names(fe.year)[1]<-"fe.year"
names(fe.id)[1]<-"fe.id"
# add FE effects ot data frame
d<-merge(d,fe.id[,c("idx","fe.id")]  ,by.x="id",by.y="idx",all.x=T)
d<-merge(d,fe.year[,c("idx","fe.year")],by.x="year",by.y="idx",all.x=T)
d<-d[order(d$id,d$year),]
d.fe<-d

Add fitted values to the data frame

d<-d.fe
e<-e.base
d$p1 <- c(e$fitted.values)
d.fit<-d

Check if prediction works

m<-d[,c("treat","asinh.gdppc_l1","asinh.nibamafines_no_l1")]

x<- e$fitted.values - c(as.matrix(m) %*% e$coefficients + d[,"fe.year"] +d[,"fe.id"]   )
table(round(x,9))
##
## -2e-09 -1e-09      0  1e-09  2e-09
##      1    306   4815    268     22
table(round(x,8)) #have to be all near zero
##
##    0
## 5412

Prediction with manipulated data set

d<-d.fe
e<-e.base
m<-d[,c("treat","asinh.gdppc_l1","asinh.nibamafines_no_l1")]
# manipulate data
mm<-m
table(m$treat)
##
##    0    1
## 5190  222
i<-which(mm$treat>0)
mm[i,"treat"]<-0
# predicted outcomes
x<-c(as.matrix(m) %*% e$coefficients + d[,"fe.year"] +d[,"fe.id"] ) #fitted
xx<-c(as.matrix(mm) %*% e$coefficients + d[,"fe.year"] +d[,"fe.id"] ) #manipulated
# marginal impact
mean(xx[i]) - mean(x[i])
## [1] 0.4895143
coefficients(e)["treat"] # similar ?
##      treat
## -0.4895143
# total impact
(sum(sinh(xx[i]))-sum(sinh(x[i])))/1000^2
## [1] 9572.927
(sum(sinh(xx))-sum(sinh(x)))/1000^2
## [1] 9572.927

Prediction with manipulated estimates

d<-d.fe
e<-e.base
m<-d[,c("treat","asinh.gdppc_l1","asinh.nibamafines_no_l1")]
#manipulate estimate
# decrease effect size by 1.67 SD to the lower bound of the 90% conf. interval
est<-coefficients(e)
est["treat"]<-est["treat"] + 1.645 * sqrt(diag(vcov(e)))["treat"]
#predict outcomes
x<-c(as.matrix(m) %*% e$coefficients + d[,"fe.year"] +d[,"fe.id"] ) #fitted
xx<-c(as.matrix(m) %*% est           + d[,"fe.year"] +d[,"fe.id"]   )
# marginal impact
mean(xx[i]) - mean(x[i])
## [1] 0.2544269
# total impact
(sum(sinh(xx[i]))-sum(sinh(x[i])))/1000^2
## [1] 4391.744

Out of sample prediction

Note: Predictions after FE estimations necessarily need fixed effect values per unit - here: d[,"fe.id"]. That means, that it is not possible to predict out of sample into new observations. One would have to assume some unit-FE (or try to predict such unit-FE) or switch to a first difference estimation, which only uses year-fixed-effects.

First-Difference FD estimation

Estimation

f<-"d.asinh.def_area ~ d.treat + d.asinh.gdppc_l1 + d.asinh.nibamafines_no_l1 | year | 0 | id"
f<-as.formula(f)
e<-felm(f,d)
summary(e)
##
## Call:
##    felm(formula = f, data = d)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
## -18.1858  -0.4537   0.0817   0.5228  19.4110
##
## Coefficients:
##                           Estimate Cluster s.e. t value Pr(>|t|)
## d.treat                   -0.86766      0.19375  -4.478  7.7e-06 ***
## d.asinh.gdppc_l1          -0.01062      0.13746  -0.077    0.938
## d.asinh.nibamafines_no_l1  0.03621      0.02182   1.660    0.097 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.947 on 4907 degrees of freedom
##   (492 observations deleted due to missingness)
## Multiple R-squared(full model): 0.05821   Adjusted R-squared: 0.05591
## Multiple R-squared(proj model): 0.001532   Adjusted R-squared: -0.0009098
## F-statistic(full model, *iid*):25.28 on 12 and 4907 DF, p-value: < 2.2e-16
## F-statistic(proj model): 7.567 on 3 and 491 DF, p-value: 5.886e-05
e.base<-e

Add fixed effects to the data frame

d<-d.base
e<-e.base
fe<-getfe(e)
table(fe$fe) # here only year fixed effects
##
## year
##   10
fe.year<-fe[fe$fe=="year",]
names(fe.year)[1]<-"fe.year"
# add FE effects ot data frame
d<-merge(d,fe.year[,c("idx","fe.year")],by.x="year",by.y="idx",all.x=T)
d<-d[order(d$id,d$year),]
d.fe<-d

Add fitted values to the data frame

d<-d.fe
e<-e.base
z<-data.frame(
  id = e$clustervar$id,
  year = e$fe$year,
  p1 = c(e$fitted.values)
)
d<-merge(d,z,by=c("id","year"),all.x=T)
# d[d$id==1100015,]
# prediction in level
a<-lapply(
  split(d,d$id),
  function(s){
    # s<-split(d,d$id)[[1]]
      s[2,"x"]<-s[1,"asinh.def_area"]+s[2,"p1"]
    for (i in 3:nrow(s)){
      s[i,"x"]<-s[(i-1),"x"]+s[i,"p1"]
    }
    s$p1l<-sinh(s$x)
    s$x<-NULL
    return(s)
  }
)
d<-do.call("rbind",a)
d.fit<-d

Check if prediction works

d<-d.fit
e<-e.base
x<-d[,c("d.treat","d.asinh.gdppc_l1","d.asinh.nibamafines_no_l1")]
d$p2 <- c(as.matrix(x) %*% e$coefficients + d[,"fe.year"]   )
table(round(d$p1-d$p2,12),useNA="always")
##
##    0 <NA>
## 4920  492

Prediction with manipulated data set

d<-d.fit
e<-e.base
# set treatment to zero
d$treat<-0
d$d.treat<-0
x<-d[,c("d.treat","d.asinh.gdppc_l1","d.asinh.nibamafines_no_l1")]
d$p2 <- c(as.matrix(x) %*% e$coefficients + d[,"fe.year"]   )
# prediction in level
a<-lapply(
  split(d,d$id),
  function(s){
    # s<-split(d,d$id)[[1]]
      s[2,"x"]<-s[1,"asinh.def_area"]+s[2,"p2"]
    for (i in 3:nrow(s)){
      s[i,"x"]<-s[(i-1),"x"]+s[i,"p2"]
    }
    s$p2l<-sinh(s$x)
    s$x<-NULL
    return(s)
  }
)
d<-do.call("rbind",a)
#impact in level
(sum(d$p1l,na.rm=T)-sum(d$p2l,na.rm=T))/1000^2
## [1] -9512.9

Prediction with manipulated estimates

d<-d.fit
e<-e.base
# reduce effect size by 1.67 to the lower bound of the 90% conf. interval
est<-coefficients(e)
est["d.treat"]<-est["d.treat"] + 1.645 * sqrt(diag(vcov(e)))["d.treat"]
x<-d[,c("d.treat","d.asinh.gdppc_l1","d.asinh.nibamafines_no_l1")]
d$p2 <- c(as.matrix(x) %*% est + d[,"fe.year"]   )
# prediction in level
a<-lapply(
  split(d,d$id),
  function(s){
    # s<-split(d,d$id)[[1]]
      s[2,"x"]<-s[1,"asinh.def_area"]+s[2,"p2"]
    for (i in 3:nrow(s)){
      s[i,"x"]<-s[(i-1),"x"]+s[i,"p2"]
    }
    s$p2l<-sinh(s$x)
    s$x<-NULL
    return(s)
  }
)
d<-do.call("rbind",a)
#impact in level
(sum(d$p1l,na.rm=T)-sum(d$p2l,na.rm=T))/1000^2
## [1] -2718.87

Out of sample prediction

Note: Predictions after FD estimations only need year fixed effects. That means, that it is possible to predict out of sample into new observations.