/***************************************************************************************************************** SAS file name: array_hash.sas File location: __________________________________________________________________________________________________________________ Purpose: To demonstrate how to implement a hashing scheme with temporary arrays in the SAS Data Step. Author: Peter Clemmensen Creation Date: 15/12/2019 This program supports the blog post "An Array Hashing Scheme in SAS" on SASnrd.com *****************************************************************************************************************/ /* Small data set */ proc plan seed=123; factors k=1e6 / noprint; output out=k; run;quit; data small; set k; d=rand('integer', 1, 1e7); run; /* Large data set */ data large; call streaminit(123); do _N_=1 to 1e8; k=rand('integer', 1, 1e6); output; end; run; /* Prime number given load factor and number of obs in 'small' */ %let load = 0.8; data _null_; do p=ceil(p/&load) by 1 until (j = up + 1); up = ceil(sqrt(p)); do j=2 to up until (not mod(p,j)); end; end; call symput('hsize',left(put(p,best.))); stop; set small nobs=p; run; %put &hsize.; /* Do the match */ data arrayhash(keep=k d); array key {0 : &hsize.} _temporary_; array link {0 : &hsize.} _temporary_; array data {0 : &hsize.} _temporary_; r=&hsize.; do until (lr1); set small end=lr1; h=mod(k, &hsize.) + 1; /* 1 */ found=0; if link [h] > . then do; /* 2 */ link traverse; /* 3 */ if found then continue; do r=r by -1 until (link [r]=.); /* 4 */ end; link [h] = r; /* 5 */ h = r; end; link [h] = 0; /* 6 */ key [h] = k; data [h] = d; end; do until (lr2); set large end=lr2; found=0; h=mod(k, &hsize.) + 1; /* 1 */ if link[h] > . then link traverse; /* 2 */ if found then d = data[h]; /* 3 */ else d=.; output; end; traverse: if k = key [h] then found=1; else if link [h] ne 0 then do; h=link [h]; link traverse; end; run; /* Direct accessing */ data KeyIndex; array lookup {9999999} _temporary_; do until (lr1); set small end=lr1; lookup[k]=d; end; do until (lr2); set large end=lr2; d=.; d=lookup[k]; output; end; run; /* Hash lookup */ data hash(drop=rc); if _N_=1 then do; declare hash h(dataset:'small', hashexp:20); h.definekey('k'); h.definedata('d'); h.definedone(); end; set large; d=.; rc=h.find(); run; proc compare base=arrayhash comp=KeyIndex; run; proc compare base=arrayhash comp=hash; run;