/***************************************************************************************************************** SAS file name: array_hashing_2.sas File location: __________________________________________________________________________________________________________________ Purpose: To elaborate further on the array hasing technique Author: Peter Clemmensen Creation Date: 10/10/2020 This program supports the blog post "Another Word on Array Hashing in SAS" on SASnrd.com *****************************************************************************************************************/ /* Clear log */ dm log "clear"; /* Delete work library */ proc datasets lib=work kill nolist; run; /* Example data */ data large(keep=k1 k2) small(keep=k1 k2 d1 d2); call streaminit(123); array r {5000000} _temporary_ (1 : 5000000); h = 5000000; do _N_ = 1 to 5000000; i = rand ("integer", h); k1 = r [i]; k2 = rand('integer', 1, 1e4); d1 = rand('integer', 1, 1e8); d2 = rand('integer', 1, 1e8); output large; if rand('uniform') < .5 then output small; r [i] = r [h]; h = h-1; end; stop; run; /* Hash Object */ data hash1(keep=k1 d1 d2); if _N_ = 1 then do; dcl hash h(dataset : 'small'); h.definekey('k1'); h.definedata('d1', 'd2'); h.definedone(); end; set large; d1 = .; d2 = .; rc = h.find(); run; /* Find a fitting prime */ %let load = 0.5; data _null_; do p=ceil(p/&load) by 1 until (j = up + 1); up = ceil(sqrt(p)); do j=2 to up until (not mod(p,j)); end; end; call symput('hsize',left(put(p,best.))); stop; set small nobs=p; run; %put &hsize.; /* Array hashing with multiple data variables */ data arrayhash1(keep=k1 d1 d2); array key {0 : &hsize.} _temporary_; array data1 {0 : &hsize.} _temporary_; array data2 {0 : &hsize.} _temporary_; array link {0 : &hsize.} _temporary_; r=&hsize.; do until (lr1); set small end=lr1; h=mod(k1, &hsize.) + 1; found=0; if link [h] > . then do; link traverse; if found then continue; do r=r by -1 until (link [r]=.); end; link [h] = r; h = r; end; link [h] = 0; key [h] = k1; data1 [h] = d1; data2 [h] = d2; end; do until (lr2); set large end=lr2; found=0; h=mod(k1, &hsize.) + 1; if link[h] > . then link traverse; if found then do; d1 = data1[h]; d2 = data2[h]; end; else call missing(d1, d2); output; end; stop; traverse: if k1 = key [h] then found=1; else if link [h] ne 0 then do; h=link [h]; link traverse; end; run; /* Test if the restuls are identical */ proc compare base=hash1 comp=arrayhash1;run; %put &sysinfo.; /* Multiple Keys */ /* Hash Object */ data hash2(keep=k1 k2 d1 d2); if _N_ = 1 then do; dcl hash h(dataset : 'small'); h.definekey('k1', 'k2'); h.definedata('d1', 'd2'); h.definedone(); end; set large; d1 = .; d2 = .; rc = h.find(); run; /* Find a fitting prime */ %let load = 0.5; data _null_; do p=ceil(p/&load) by 1 until (j = up + 1); up = ceil(sqrt(p)); do j=2 to up until (not mod(p,j)); end; end; call symput('hsize',left(put(p,best.))); stop; set small nobs=p; run; %put &hsize.; /* Array hashing with multiple data variables */ data arrayhash2(keep=k1 k2 d1 d2) / stack = 100; array key1 {0 : &hsize.} _temporary_; array key2 {0 : &hsize.} _temporary_; array data1 {0 : &hsize.} _temporary_; array data2 {0 : &hsize.} _temporary_; array link {0 : &hsize.} _temporary_; r=&hsize.; do until (lr1); set small end=lr1; h = mod(mod(k1, &hsize.) * 1e4 + k2, &hsize.); found=0; if link [h] > . then do; link traverse; if found then continue; do r=r by -1 until (link [r]=.); end; link [h] = r; h = r; end; link [h] = 0; key1 [h] = k1; key2 [h] = k2; data1 [h] = d1; data2 [h] = d2; end; do until (lr2); set large end=lr2; found=0; h = mod(mod(k1, &hsize.) * 1e4 + k2, &hsize.); if link[h] > . then link traverse; if found then do; d1 = data1[h]; d2 = data2[h]; end; else call missing(d1, d2); output; end; stop; traverse: if k1 = key1 [h] and k2 = key2 [h] then found=1; else if link [h] ne 0 then do; h=link [h]; link traverse; end; run; /* Test if the restuls are identical */ proc compare base=hash2 comp=arrayhash2;run; %put &sysinfo.;