#!/usr/bin/perl
$wantse = 0;
$nofe = 0;
$savecomp = 0;
$analyze = 0;
$complim = 0;
$merge = 0;
$collect = 1;
$tol = 0;
$itsbig = 0;
$randkac = 0;
$bN = 0;
$exactDF = 0;
$robust = 'FALSE';
$usecg = 'FALSE';
$accel = 1;
$ivtest = 0;
while(<>) {
  chomp($_);
  # strip comments
  s/#.*$//;
  @line = split;
  next if($#line == -1);
  $cmd = shift @line;
  if($cmd eq 'file') {
    $datafile = shift @line;
  } elsif($cmd eq 'vars') {
    @filevars = @line;
  } elsif($cmd eq 'dummy') {
      @dummies = @line;
  } elsif($cmd eq 'response') {
      die "keyword 'response' no longer supported";
  } elsif($cmd eq 'covars') {
      die "keyword 'covars' no longer supported";
  } elsif($cmd eq 'exactdf') {
      $exactDF = 1;
  } elsif($cmd eq 'iv') {
      $iv = join(' ',@line);
  } elsif($cmd eq 'robust') {
      $robust = 'TRUE';
  } elsif($cmd eq 'cluster') {
      $clustervar = shift @line;
      $robust = 'TRUE';
  } elsif ($cmd eq 'model') {
      $model = join(' ',@line);
  } elsif($cmd eq 'fe') {
      die "keyword 'fe' no longer supported";
  } elsif($cmd eq 'se') {
      $bN = 0 + shift(@line);
      if($bN == 0) {$bN = 100;}
  } elsif($cmd eq 'out') {
    @outlist = @line;
  } elsif($cmd eq 'randkac') {
      $randkac=1;
  } elsif($cmd eq 'nofe') {
      $nofe = 1;
  } elsif($cmd eq 'test') {
      $testvars = '"'. join('","',@line) . '"';
  } elsif($cmd eq 'ivftest') {
      $ivtest = 1;
  } elsif($cmd eq 'savecomp') {
      $savecomp = 1;
  } elsif($cmd eq 'analyze') {
      $analyze = 1;
  } elsif($cmd eq 'merge') {
      $merge = 1;
  } elsif($cmd eq 'accel') {
      $accel = 1;
  } elsif($cmd eq 'complim') {
      $complim = 0+shift(@line);
  } elsif($cmd eq 'splitcomp') {
      $collect = 0;
  } elsif($cmd eq 'se') {
      $wantse = 0; # disabled due to lack of good algorithms
  } elsif($cmd eq 'itsbig') {
      $itsbig = 1;
  } elsif($cmd eq 'usecg') {
      $usecg = 'TRUE';
  } elsif($cmd eq 'tol') {
      $tol = shift(@line);
  } else {
    die "Unknown line starting with $cmd";
  }
}


if($datafile =~ m/\.dta$/i) {
    $format = 'dta';
} elsif ($datafile =~ m/\.rda|\.rdata/i) {
    $format = 'rda';
} else {
    $format = 'csv';
}

# either covars and response must be specified
# or model
#if(defined($model)) {
#    if(defined(@covars) || defined($resp)) {
#	print STDERR "WARNING: model line overrides covars and response \n";
#    }
#} elsif(!defined(@covars) || !defined($resp)) {
#    die "Either both covars and response must be defined, or model\n";
#} else {
#    $model = $resp . "~" . join('+',@covars);
#}

#if(defined(@fe)) {
#  $model .= '+G('.join(')+G(',@fe).')'
#}


# That's it, now, generate R-code
print "#switch off echo for non-interactive runs\n";
print "if(!interactive()) options(echo=FALSE)\n";

print "
dformat = '$format';
# show warnings as they occur
oldopts <- options(warn=1)
version <- packageDescription('lfe',fields='Version')
packdata <- packageDescription('lfe',fields='Packaged')
cat('******* This is the lfe',version,packdata,'\\n')
citation('lfe')
";
if($tol > 0) {print "options(lfe.eps=$tol)\n";}
print "forcefact <- c()\n";
if($#dummies >=0) {
    print "forcefact <- c('" .join("','",@dummies) . "')\n";
}

print "
require('lfe')
options(lfe.usecg=$usecg)
options(lfe.accel=$accel)
# read the datafile, with correct types, numeric for covariates,
# character for categories

model <- formula($model)
trm <- terms(model,special='G')
feidx <- attr(trm,'specials')\$G+1
va <- attr(trm,'variables')
festr <- paste(sapply(feidx,function(i) deparse(va[[i]])),collapse='+')
felist <- parse(text=paste('list(',gsub('+',',',festr,fixed=TRUE),')',sep=''))
nm <- eval(felist,list(G=function(arg) deparse(substitute(arg))))

#trm <- terms(model,specials='G')
#feidx <- attr(trm,'specials')\$G+1
#va <- attr(trm,'variables')
#festr <- paste(sapply(feidx,function(i) deparse(va[[i]])),collapse='+')

fvars <- all.vars(parse(text=festr))

if(dformat == 'dta') {
    library(foreign)
    cat(date(),'Start reading Stata file','$datafile\\n')
    tab <- read.dta('$datafile',convert.factors=NA)
    cat(date(),nrow(tab),' records read from','$datafile\\n')
} else if(dformat == 'rda') {
    load('$datafile')
} else {

  filevars = c('" . join("','",@filevars) . "')
  #What are their types on input?  
  # fe's should be read as character
  # covars and response as numeric
  # unused as character
  # also, those in array @dummies should be textual
  # so that they're coerced to factor
  # we know the fe's, and response, we must figure out the covars
  # used in model.  It should be parsed in R
  ";
  print "varnames <- all.vars(model)\n";
  if(defined($iv)) {
     print "ivspec <- $iv
     varnames <- unique(c(varnames,if(is.list(ivspec)) unlist(lapply(ivspec,all.vars)) else all.vars(ivspec)))"
  }
  print "
  typelist <- vector('list',length(filevars))
  names(typelist) <- filevars
  typelist[names(typelist) %in% varnames]  <- 0
  typelist[names(typelist) %in% forcefact] <- ''
  typelist[fvars] <- ''
  ";
  if(defined($clustervar)) {
    print "  typelist['$clustervar'] <- ''\n";
  }
  print "
  cat(date(),'Start reading from file','$datafile\\n')
  # it takes forever to create a data.frame with a large file
  # we stick to a list() from scan, but we convert the
  # factors manually
  #tab <- data.frame(scan('$datafile',what=typelist))
  tab <- scan('$datafile',what=typelist,multi.line=FALSE)
  # drop those we don't use, @@@@@ what about merging? @@@@@
  tab <- tab[unlist(lapply(typelist,function(t) !is.null(t)))];
  cat(date(),'File read\\n')
}

# convert to factors manually
for( f in forcefact) {
    tab[[f]] <- factor(tab[[f]])
}

if(!inherits(tab,'data.frame')) {
  # fake a data.frame
  #attr(tab,'row.names') <- 1:length(tab[[1]])
  class(tab) <- 'data.frame'
  attr(tab,'row.names') <- NULL
}

factors <- lfe:::parseformula(model,tab)[['fl']]

for(i in seq_along(factors)) {
  cat(date(),'There are',nlevels(factors[[i]]),names(factors)[[i]],'effects\\n')
}

cf <- compfactor(factors)
cat(date(),'There are',nlevels(cf),'connected components\\n')
if(length(factors) > 2) 
  cat(date(),'There are more than 2 dummy-groups, so nobody can help you if the dummies are non-estimable\\n')
cat(date(),'Centering tolerance is',getOption('lfe.eps'),'\\n')
cat(date(),'Centering variables and doing OLS on centered system...\\n\\n')
";
$cvar = '';
if(defined($clustervar)) {
    $cvar = ", clustervar='$clustervar'";
}
$edf = '';
if($exactDF == 1) {
    $edf = ", exactDOF=TRUE";
}
#print "print('felm($model,data=tab$cvar$edf)');print(names(tab))\n";
if(defined($iv)) {
    print "est <- felm($model,data=tab,iv=$iv$cvar$edf)\n";
} else {
  print "est <- felm($model,data=tab$cvar$edf)\n";
}
print "
robust <- $robust || !is.null(est\$clustervar)
if(!is.null(est\$stage1)) {
  cat('First stage estimation:\\n')
  for(lh in est\$stage1\$lhs) {
    print(summary(est\$stage1, robust=robust, lhs=lh))
  }
  cat('Second stage estimation:\\n')
}
if(length(est\$lhs) > 1) {
  for(lh in est\$lhs) {
    print(summary(est, robust=$robust, lhs=lh))
  }
} else {
  print(summary(est, robust=$robust))
}
";

if(defined($testvars)) {
print "
  message(date(), ' Computing F-tests for joint significance of endog. vars')
  for(lh in est\$lhs) {
   W <- waldtest(est, c($testvars), lhs=lh)
   message(\"F-test of joint IV significance(lhs=\",lh,\"): \", 
       formatC(W['F']), \" on \", W['df1'], \" and \", W['df2'], 
       \" df, p-value: \", format.pval(W['p.F']))
  }
  cat('\n')
";
}

if($ivtest) {
    print "
if(!is.null(est\$stage1)) {
message(date(), ' Computing conditional F-tests for IV')
print(condfstat(est));
} else {
message('**** conditional F-test specified, but no IV performed')
}
";
}

print "
if(length(est\$lhs) == 1) {
  se <- est\$se
} else {
  se <- do.call(cbind, lapply(est\$STATS, function(s) s\$se))
}
sink('coef.csv')
print(data.frame(value=est\$coefficients, se=se, row.names=rownames(est\$coefficients)))
sink(NULL)
cat(date(),'Coefficients written to coef.csv\\n')
";
if($savecomp) {
  print "
  fr <- data.frame(comp=cf,factors)
  for(i in seq_along(factors)) {
    tfr <- unique(fr[,c(1,i+1)])
    fn <- paste('comp-',names(factors)[[i]],'.csv',sep='')
    write.table(tfr,file=fn,row.names=FALSE,quote=FALSE)    
    cat(date(),'Components written to file',fn,'\\n')
  }

  ";
}

if($nofe) {
  print "cat(date(),'Fixed effects not requested, finishing\\n')
  options(oldopts)
  if(!interactive()) quit('no')";
  exit;
}

print "cat(date(),'Continuing with finding fixed effects\\n')";

print "

# So, that's it, if we're not interested in finding the
# fixed effects.  But if we are, we must pick up the
# residuals for the full model, i.e. use est to predict
# non-centered Y from non-centered X and find the residuals.

# We can't use predict directly since we've messed around
# with the intercept.  We do it manually

# We need to save some memory by ditching the large parts of est
# (the qr and stuff, keep only what's needed)
est <- est[c('fe','residuals','refnames','r.residuals','cfactor','lhs')]
invisible(gc())
class(est) <- NULL
";
if($bN > 0) {
    print "fes <- getfe(est,se=TRUE,bN=$bN)\n";
} else {
    print "fes <- getfe(est)\n";
}
print "
lhs <- est\$lhs
rm(est)
invisible(gc())
cat(date(),'Fixed effects found; just doing some bookkeeping now...\\n')

";

print "

# split the frame into categories
frms <- list()
effnam <- 'effect'
senam <- 'se'
if(length(lhs) > 1) effnam <- paste('effect',lhs,sep='.')
if(length(lhs) > 1) senam <- paste('se',lhs,sep='.')

for(nm in levels(fes[,'fe'])) {

  whch <- which(fes[,'fe'] == nm)
  fr <- fes[whch,]
  fn <- paste('fe-',nm,'.',dformat,sep='')  
  fr[,nm] <- fr[,'idx']
  rownames(fr) <- fr[,nm]
";
if($bN > 0) {
    print "fr <- fr[,c(nm,effnam,senam,'comp','obs')]\n";
} else {
    print "fr <- fr[,c(nm,effnam,'comp','obs')]\n";
}

print "
  if(dformat == 'dta') {
     write.dta(fr,fn,convert.factors='numeric')
  } else {
    write.table(fr,file=fn,quote=FALSE,row.names=FALSE)
  }
  frms[[nm]] <- fr
  cat(date(),'FE written to',fn,'\\n')      
}
";



# note to self.  Check whether package 'plyr' may be used for 
# faster merging.
if($merge) {
 print "
# stuff it back into the dataset

cat(date(),'Doing some more bookkeeping, i.e. merging...\\n')
class(tab) <- 'list'
tab[['comp']] <- cf
for(en in effnam) {
 for(f in names(frms)) {
   fr <- frms[[f]]
   fact <- tab[[f]]
   eff <- paste(f,en,sep='.')

   tab[[eff]] <- rep(0,length(fact))
   split(tab[[eff]],fact) <- fr[,en]
#   tab[[effnam]] <- fr[fact,'effect'] 
 }
}
class(tab) <- 'data.frame'

cat(date(),'Writing merged data set to fe-merged.',dformat,'\\n')
rownames(tab) <- 1:length(tab[[1]])
if(dformat == 'dta') {
  write.dta(tab,'fe-merged.dta')
} else {
  write.table(tab,'fe-merged.csv',quote=FALSE,row.names=FALSE)
}
";
}

print "
cat(date(),'Computation done\\n')
options(oldopts)
if(!interactive()) quit('no')
";
