#!/usr/local/bin/perl #=============================================================================== # # Source code: # # C code: www.datashaping.com/regress3.c.txt (well documented, fast) # Perl Code: www.datashaping.com/regress_pl.txt (not documented) # Documentation: www.datashaping.com/software.shtml # # Help: www.datashaping.com/contact.shtml # Developed by Vincent Granville, Data Shaping Solutions # # This header can not be deleted. As long as this condition is satisfied, # this software can be distributed freely. # # Purpose: # # Linear and ridge regression based on iterative robust algorithm and # boostrap to estimate parameters and compute confidence intervals for # regression parameters. Efficiently handles large number of variables. # #===============================================================================*/ # # The data is read from tab-separated ASCI file rg.txt; # each column is saved into a separate file rg_xx.txt. # # input file rg.txt = data set; column 0 = dependent var # output files rg_xx.txt are individual columns # output file rg_err.txt = error # output file rg_log.txt = program logs # output @param = regression parameters; # output var $obs = number of observations # output var $var = number of columns in input file # $init=-1; $obs=-1; $var=-1; $seed=-1; $MSE=0; $MSE_init=0; @param=(); @param_seed=(); `del rg_*.txt`; # on UNIX, use "rm" instead of "del" $time1=time(); &Regress_init(); &Bootstrap(2,0,50); #&Validation(2,10,150); ## 2 / 50 / 2 print "\n\n"; #&Init_param(2,50,10); &Regress(3,250,"REGRESSION",0); #&Submodels(); #&Summary(); $resvar = 1-$MSE/$MSE_init; printf "\nInitial MSE = $MSE_init\n"; printf "Final MSE = $MSE\n"; printf "Variance reduction: %f \%\n\n",100*$resvar; printf "Regression parameters:\n\n"; for($l=1; $l<$var; $l++) { printf "a[%d] = %f\n",$l,$param[$l]; } $time2=time(); $dtime=$time2-$time1; print "\nTime elapsed: $dtime seconds.\n"; #============================================================ sub Validation{ local($mode,$nvalid,$niter)=@_; local($nsvar,$n,$label); # global: *param &Regress_init("NOPRINT"); for ($n=0; $n< $nvalid; $n++) { print "\nVALIDATION Test $n\n\n"; $label="VALIDATION_"."$n"; $nsvar=&Regress($mode,$niter,$label); } } #============================================================ sub Init_param{ local($mode,$nvalid,$niter)=@_; local($nsvar,$nsvar_max,$k,$n,$label); # global: *param_seed, *param, $seed # output: *param_seed, $seed $nsvar_max=-1; @param_seed=(); &Regress_init("NOPRINT"); for ($n=0; $n< $nvalid; $n++) { print "\nINITPARAM Test $n\n\n"; $label="INITPARAM"."$n"; $nsvar=&Regress($mode,$niter,$label,0); if ($nsvar>$nsvar_max) { $nsvar_max=$nsvar; for ($k=1; $k<$var; $k++) { $param_seed[$k]=$param[$k]; } } } $seed=1; } #============================================================ sub Summary{ local($i,$cols,$iter,*lu); $maxIterBootstrap=-1; $maxIterValidation=-1; open(LOG,") { @lu=split(/\t/,$i); $cols=$#lu+1; # number of cols in rg_log.txt $label=$lu[0]; $iter=$lu[1]; if (($label=~ "BOOTSTR")&&($iter>$maxIterBootstrap)) { $maxIterBootstrap=$iter; } if (($label=~ "VALIDATION")&&($iter>$maxIterValidation)) { $maxIterValidation=$iter; } } close(LOG); for($k=4; $k<$cols; $k++) { $minBootstrap[$k]=999999999; $maxBootstrap[$k]=-999999999; $minValidation[$k]=999999999; $maxValidation[$k]=-999999999; } open(LOG,") { $i =~ s/\n//g; @lu=split(/\t/,$i); $label=$lu[0]; $iter=$lu[1]; if (($label =~ "BOOTSTR")&&($iter==$maxIterBootstrap)) { for ($k=4; $k<$cols; $k++) { if ($lu[$k]>$maxBootstrap[$k]) { $maxBootstrap[$k]=$lu[$k]; } if ($lu[$k]<$minBootstrap[$k]) { $minBootstrap[$k]=$lu[$k]; } } } if (($label =~ "VALIDATION")&&($iter==$maxIterValidation)) { for ($k=4; $k<$cols; $k++) { if ($lu[$k]>$maxValidation[$k]) { $maxValidation[$k]=$lu[$k]; } if ($lu[$k]<$minValidation[$k]) { $minValidation[$k]=$lu[$k]; } } } } close(LOG); open(LOG,">>rg_log.txt"); print LOG "SUMMARY\tBOOTSTRAP\tMIN\t-"; for ($k=4; $k<$cols; $k++) { print LOG "\t$minBootstrap[$k]"; } print LOG "\nSUMMARY\tBOOTSTRAP\tMAX\t-"; for ($k=4; $k<$cols; $k++) { print LOG "\t$maxBootstrap[$k]"; } print LOG "\nSUMMARY\tVALIDATION\tMIN\t-"; for ($k=4; $k<$cols; $k++) { print LOG "\t$minValidation[$k]"; } print LOG "\nSUMMARY\tVALIDATION\tMAX\t-"; for ($k=4; $k<$cols; $k++) { print LOG "\t$maxValidation[$k]"; } close(LOG); } #============================================================ sub Bootstrap{ local($nsample,$mode,$niter)=@_; local($k,$n,$m,$nobs,$idx,$i,*pick); `copy rg.txt rg_sec.txt`; $obs=0; open(DATA,"< rg_sec.txt"); while ($i=) { $obs++; } close(DATA); for ($n=0; $n<$nsample; $n++) { # pick up observations print "\nBOOTSTRAP subsample $n\n"; @pick=(); for ($k=0; $k<$obs;$k++) { $idx=int($obs*rand()); $pick[$idx]++; } # create subsample open(DATA,"< rg_sec.txt"); open(SAMPLE,">rg.txt"); $m=0; $nobs=0; while ($i=) { for ($k=0; $k<$pick[$m]; $k++) { $i =~ s/\n//g; if ($nobs == $obs-1) { print SAMPLE $i; # no \n to avoid creating an extra empty row at the end of dataset } else { print SAMPLE "$i\n"; } $nobs++; } $m++; } close(SAMPLE); close(DATA); $label="BOOTSTR_"."$n"; &Regress_init("NOPRINT"); &Regress($mode,$niter,$label); } `copy rg_sec.txt rg.txt`; &Regress_init(); } #============================================================ sub Regress{ local($mode,$niter,$label,$seed_flag)=@_; local($k,$l,$i,$iter,$xd,$sp,$lambda,*lu); local(*col,*err); # global: *param, obs, seed, MSE, MSE_init # need to run Regress_init first if the data set is new if ($init != 1) { print "Must run Regress_init() first."; exit(1); } open(LOG,">>rg_log.txt"); open(RG0,"; close(RG0); open(ERR,">rg_err.txt"); $obs=$#err+1; $MSE_init=0; for ($k=0; $k<$obs; $k++) { $MSE_init+= $err[$k] * $err[$k]; printf ERR "$err[$k]"; } close(ERR); $MSE_init=sqrt($MSE_init/$obs); for ($k=1; $k<$var; $k++) { $param[$k]=0; } ## if init=1 uses initial regressors if ($seed_flag==1) { if ($seed !=1) { print "Must run Init_param() first.\n"; exit(2); } for ($k=1; $k<$var;$k++) { $param[$k]=$param_seed[$k]; } open(RG,"rg_err.txt"); while ($i=) { $error=0; @lu=split(/\t/,$i); for ($k=1; $k<$var; $k++) { $error+=$param[$k]*$lu[$k]; } $error=$lu[0]-$error; print ERR "$error\n"; } close(ERR); close(RG); print "\n"; } ## regression for ($iter=0; $iter< $niter; $iter++) { if (($mode == 0)||($mode ==3)) { $l= 1 + ($iter % ($var - 1)); } else { $l = 1 + int(($var - 1)*rand()); } open(COL,"; close(COL); open(ERR,"; close(ERR); $xd=0; $sp=0; for ($k=0; $k<$obs; $k++) { $xd+=$col[$k]*$col[$k]; $sp+=$col[$k]*$err[$k]; } if ($xd==0) { print "Empty column.\n"; exit(2); } $lambda = $sp/$xd; if ($mode==1) { $lambda = $lambda * rand(); } open(OUT_ERR,">rg_err.txt"); $MSE=0; for ($k=0; $k<$obs; $k++) { $e_new = $err[$k] - $lambda * $col[$k]; $MSE+= $e_new * $e_new; print OUT_ERR "$e_new\n"; } close(OUT_ERR); $param[$l]+=$lambda; ## save resuls, compute resvar if (($iter %10 ==0)||($iter == $niter-1)) { $MSE = sqrt($MSE/$obs); $resvar = 1-$MSE/$MSE_init; if ($label ne "NOPRINT") { printf LOG "%s\t%d\t%d\t%f\t%f",$label,$iter,$l,$lambda,$resvar; for ($k=1; $k<$var; $k++) { printf LOG "\t%f",$param[$k]; } printf LOG "\n"; print "REGRESS $iter\t$resvar\n"; } } } close(LOG); return($resvar); } #============================================================ sub Regress_init { local($label)=@_; local($i,$k,*row); # use this routine when processing a new dataset # global: *param, obs, var, init $obs=0; $var=0; # detect number of var open(RG,"; @row=split(/\t/,$i); $var = $#row + 1; # number of elements in @row close(RG); # create one file for each var for ($k=0; $k<$var; $k++) { open($k,">rg_$k.txt"); } $obs=0; open(RG,") { @row=(); @row=split(/\t/,$i); for ($k=0; $k<$var; $k++) { $row[$k] =~ s/\n//g; printf $k "$row[$k]\n"; } $obs++; } close(RG); for ($k=0; $k<$var; $k++) { close($k); } print "INIT $obs observations / $var variables detected\n\n"; $init=1; } #============================================================ sub Submodels{ local($i,$j,$k,$l,$rho,$resvar,$lu,$dim,$MSE1); # need to run Regression before # must perform regression first # input: MSE_init, param, var open(COL0,"; close(COL0); open(LOG,">>rg_log.txt"); if ($var>0) { $dim=1; for ($i=1; $i<$var; $i++) { open(COL1,"; close(COL1); $obs=$#col1+1; $rho=0; for($l=0; $l<$obs; $l++) { $lu=$col0[$l] -$param[$i]*$col1[$l]; $rho+=$lu*$lu; } $MSE1=sqrt($rho/$obs); $rho=($MSE1-$MSE_init) / ($MSE-$MSE_init); $resvar=1-$MSE1/$MSE_init; print LOG "SUBMODELS\t$dim\t\t$rho\t$resvar"; for ($l=1; $l<$var; $l++) { if ($l==$i) { print LOG "\t$param[$l]"; } else { print LOG "\t0"; } } print LOG "\n"; } } if ($var>1) { $dim=2; for ($i=1; $i<$var; $i++) { open(COL1,"; close(COL1); for ($j=$i+1; $j<$var; $j++) { open(COL2,"; close(COL2); $obs=$#col1+1; $rho=0; for($l=0; $l<$obs; $l++) { $lu=$col0[$l] -$param[$i]*$col1[$l] -$param[$j]*$col2[$l]; $rho+=$lu*$lu; } $MSE1=sqrt($rho/$obs); $rho=($MSE1-$MSE_init) / ($MSE-$MSE_init); $resvar=1-$MSE1/$MSE_init; print LOG "SUBMODELS\t$dim\t\t$rho\t$resvar"; for ($l=1; $l<$var; $l++) { if (($l==$i)||($l==$j)) { print LOG "\t$param[$l]"; } else { print LOG "\t0"; } } print LOG "\n"; } } } if ($var>2) { $dim=3; for ($i=1; $i<$var; $i++) { open(COL1,"; close(COL1); for ($j=$i+1; $j<$var; $j++) { open(COL2,"; close(COL2); for ($k=$j+1; $k<$var; $k++) { open(COL3,"; close(COL3); $obs=$#col1+1; $rho=0; for($l=0; $l<$obs; $l++) { $lu=$col0[$l] -$param[$i]*$col1[$l] -$param[$j]*$col2[$l] -$param[$k]*$col3[$l]; $rho+=$lu*$lu; } $MSE1=sqrt($rho/$obs); $rho=($MSE1-$MSE_init) / ($MSE-$MSE_init); $resvar=1-$MSE1/$MSE_init; print LOG "SUBMODELS\t$dim\t\t$rho\t$resvar"; for ($l=1; $l<$var; $l++) { if (($l==$i)||($l==$j)||($l==$k)) { print LOG "\t$param[$l]"; } else { print LOG "\t0"; } } print LOG "\n"; } } } } close(LOG); }