/*=============================================================================== Source code: C code: www.datashaping.com/regress3.c.txt (well documented, fast) Perl Code: www.datashaping.com/regress_pl.txt (not documented) Documentation: www.datashaping.com/software.shtml Help: www.datashaping.com/contact.shtml Developed by Vincent Granville, Data Shaping Solutions This header can not be deleted. As long as this condition is satisfied, this software can be distributed freely. Purpose: Linear and ridge regression based on iterative robust algorithm and boostrap to estimate parameters and compute confidence intervals for regression parameters. Efficiently handles large number of variables. ===============================================================================*/ /* The data is read from tab-separated ASCI file rg.txt; each column is saved into auxilary files rg_xx.txt. File rg.txt: data set; column 0 = dependent var Files rg_xx.txt are copies of individual columns File rg_err.txt: residual error per observation File rg_log.txt: program logs: this file contains all results (regression coefficients, empirical distribution for regression coefficients, etc.) The structure of rg_log.txt easily allows for parsing and further statistical processing: col 1: xxxxx_yy where xxxxx is the name of the procedure being run and yy is the iteration. col 2: iteration in the Regression call; set VERBOSE to -1 if you don't want this level of detail - then only results obtained at the final iteration are saved in rg_log.txt col 3: variable optimized at current iteration col 4: lambda (internal variable) col 5: reduction in standard deviation of error achieved at current iteration; this value is between 0 and 1; 0 corresponds to all regression coefficients set to zero; 1 means perfect fit cols 6, 7, 8 etc.: current value of regression coefficients *param: regression coefficients *param_seed: initial regression coefficients (usually 0) obs: number of observations var: number of columns in input file MSE: mean squared error MSE_init: MSE when regression coefficients are 0 init: flag to indicate that Regress_init has been called. seed: flag to indicate that Init_param has been called. */ #include #include #include #include #include #include #define VERBOSE 0 /* 0 for detailed logs, -1 for summary */ long init=-1; long obs=-1; long var=-1; long seed=-1; double MSE=0; double MSE_init=0; double *param; double *param_seed; int Init_param(long mode,long nvalid,long niter); int Regress_init(); int Bootstrap(long nsample,long mode,long niter); double Regress(long mode,long niter, char *label,long seed_flag); int Validation(long mode, long nvalid, long niter); int Create_rgfilename(long k, char *filename); int main() { double deltaTime; time_t start,finish; /* to meaure time elapsed */ system("del c:\\ftp\\IFDindex\\programs\\rg_*.txt"); time(&start); Regress_init(""); Validation(2,10,150); Init_param(2,5,4); Bootstrap(20,0,100); Regress(3,100,"REGRESSION",1); time(&finish); deltaTime=difftime(finish,start); printf("Time elapsed %lf sec.\n",deltaTime); if (seed==1) { free(param_seed); } if (init==1) { free(param); } } /*---------------------------------------------------------------------*/ int Validation(long mode, long nvalid, long niter) { /* Perform nvalid regressions with different starting points to see if all regressions produce the same results. If not, either you use too few iterations (increase niter up to 200) or your dataset has collinearity. In case of collinearity, this procedure will identify up to nvalid different optimum solutions to the problem. Input (global): *param, obs, var Input: mode: should be 2 (recommended) or 3; see Regression for description Input: nvalid: number of validations Input: niter: number of iterations to be used in regression Output: nvalid sets of regression coefficients in rg_log.txt */ long n; double nsvar; char label[128],digits[8]; label[127]='\0'; digits[7]='\0'; Regress_init("NOPRINT"); /* noprint eliminate output to rg_log.txt */ for (n=0; n< nvalid; n++) { printf("\nVALIDATION Test %ld\n\n",n); itoa(n,digits,10); strcpy(label,"VALIDATION_"); strcat(label,digits); nsvar=Regress(mode,niter,label,0); } } /*---------------------------------------------------------------------*/ int Init_param(long mode,long nvalid,long niter) { /* Performs nvalid approximate regression with different starting points to identify an initial approximate solution. This solution will be used as starting point (seed) for the regression. Input: mode: should be 2 (recommended) or 3; see Regression for details Input: nvalid: number of regressions to perform Input: niter: number of iterations to use in each regression; should be small here (<10) Input: mode: should be 2 (recommended) or 3; see Regression for description Output (global): seed is set to 1 to indicate that Regression must use param_seed Output: param_seed: regression coefficients to be used as starting point in Regression */ long k,n; double nsvar,nsvar_max; char label[128],digits[8]; label[127]='\0'; digits[7]='\0'; nsvar_max=-1; if (seed==1) { free(param_seed); } else { seed=1; } param_seed=(double *)calloc(var,2*sizeof(double)); Regress_init("NOPRINT"); for (n=0; n< nvalid; n++) { printf("\nINITPARAM Test %ld\n\n",n); itoa(n,digits,10); strcpy(label,"INITPARAM_"); strcat(label,digits); nsvar=Regress(mode,niter,label,0); if (nsvar>nsvar_max) { nsvar_max=nsvar; for (k=1; k0) { e_new+=param[k]*val; } /* k=0 is the dependent var */ if (k==0) { y=val; } k++; if (k==var) { k=0; fprintf(ERR,"%lf\n",y-e_new); e_new=0; } } fclose(ERR); fclose(RG); printf("\n"); } /* regression */ for (iter=0; iter< niter; iter++) { if ((mode == 0)||(mode ==3)) { l= 1 + (iter % (var - 1)); } else { l = 1 + (long)rand()%(var - 1); } Create_rgfilename(l,filename); COL=fopen(filename,"rt"); k=0; while (!feof(COL)) { fscanf(COL,"%s\n",stri); val=atof(stri); col[k]=val; k++; } fclose(COL); ERR=fopen("c:\\ftp\\IFDindex\\programs\\rg_err.txt","rt"); k=0; while (!feof(ERR)) { fscanf(ERR,"%s\n",stri); val=atof(stri); err[k]=val; k++; } fclose(ERR); xd=0; sp=0; for (k=0; k