/***************************************************************************************************************** SAS file name: Discrete_Dist_Fit.sas File location: _________________________________________________________________________________________________________________ Purpose: Fit discrete distributions to univariate data using PROC GENMOD Author: Peter Clemmensen Creation Date: 12/07/2017 This program supports the blog post "Fit Discrete Distributions to Univariate Data" on SASnrd.com *****************************************************************************************************************/ /* Some small count dataset */ data CountData; input count @@; datalines; 5 8 5 6 3 3 6 4 8 9 5 2 6 6 6 3 6 7 5 4 1 7 7 6 5 5 4 4 4 4 5 8 8 5 9 10 4 9 5 3 7 6 13 2 4 12 11 12 7 7 2 3 2 3 3 ; /* Tabulate counts and plot data */ proc freq data=CountData noprint; tables count / out=CountMinMax; run; data _null_; set CountMinMax end=eof; if _N_=1 then call symputx('minCount', count); if eof then call symputx('maxCount', count); run; %put min=&minCount max=&maxCount; /* Visualize the data */ title 'Frequency Plot of Count Dataset'; proc sgplot data=CountData; vbar count; xaxis display=(nolabel); yaxis display=(nolabel); run; title; /* Fit Poisson distribution to data */ proc genmod data=CountData; model count= /dist=Poisson; /* No variables are specified, only mean is estimated */ output out=PoissonFit p=lambda; run; /* Save Poisson parameter lambda in macro variables */ data _null_; set PoissonFit(obs=1); call symputx('lambda', lambda); run; /* Use Min/Max values and the fitted Lambda to create theoretical Poisson Data */ data TheoreticalPoisson; do count=&minCount to &maxCount; po=pdf('Poisson', count, &lambda); output; end; run; /* Negative Binomial Example */ /* Fit Negative Binomial distribution to data */ proc genmod data=CountData; model count= /dist=NegBin; /* No variables are specified, only mean is estimated */ ods output parameterestimates=NegBinParameters; run; /* Transpose Data */ proc transpose data=NegBinParameters out=NegBinParameters; var estimate; id parameter; run; /* Calculate k and p from intercept and dispersion parameters */ data NegBinParameters; set NegBinParameters; k = 1/dispersion; p = 1/(1+exp(intercept)*dispersion); run; /* Save k and p in macro variables */ data _null_; set NegBinParameters; call symputx('k', k); call symputx('p', p); run; /* Calculate theoretical Negative Binomial PMF based on fitted k and p */ data TheoreticalNegBin; do count=&minCount to &maxCount; NegBin=pdf('NegBinomial', count, &p, &k); output; end; run; /* Merge The datasets for plotting */ data PlotData(keep=count freq po negbin); merge TheoreticalPoisson TheoreticalNegBin CountMinMax; by count; freq = PERCENT/100; run; /* Overlay fitted Poisson density with original data */ title 'Count data overlaid with fitted distributions'; proc sgplot data=PlotData noautolegend; vbarparm category=count response=freq / legendlabel='Count Data'; series x=count y=po / markers markerattrs=(symbol=circlefilled color=red) lineattrs=(color=red)legendlabel='Fitted Poisson PMF'; series x=count y=NegBin / markers markerattrs=(symbol=squarefilled color=green) lineattrs=(color=green)legendlabel='Fitted Negative Binomial PMF'; xaxis display=(nolabel); yaxis display=(nolabel); keylegend / location=inside position=NE across=1; run; title;