/***************************************************************************************************************** SAS file name: array_hash_macro.sas File location: __________________________________________________________________________________________________________________ Purpose: To demonstrate how to use the array hash macros beautifully written by Richard A. DeVenezia Author: Peter Clemmensen Creation Date: 05/02/2020 This program supports the blog post "A SAS Macro Approach to Array Hashing" on SASnrd.com *****************************************************************************************************************/ /* Example data */ proc plan seed=123; factors k=1e6 / noprint; output out=k; run;quit; data small; set k; d=rand('integer', 1, 1e7); run; /* Large data set */ data large; call streaminit(123); do _N_=1 to 1e8; k=rand('integer', 1, 1e6); output; end; run; data small; do k=2, 4, 6, 8; d=k; output; end; run; data large; do k=1 to 10; output; end; run; /* A simple Many to One Array hashing lookup using macros */ %let hash_size = %hashSize (load=0.8, data=small); %put &hash_size.; data arrayhash (keep=k d); %declareHash (h, &hash_size, policy=CL); %declareHashKey (h, k); %declareHashData (h, d); do until (lr1); set small end=lr1; %hashAddKey (h, rc=rc); %hashSetData (h, d); *temp = catq('d', '|', of h_keys[*]); *put temp=; end; do until (lr2); set large end=lr2; call missing (d); %hashFetch_CL(h, rc=rc); if rc=0 then d = %hashGetData (h, d); output; end; run; %let load = 0.8; data _null_; do p=ceil(p/&load) by 1 until (j = up + 1); up = ceil(sqrt(p)); do j=2 to up until (not mod(p,j)); end; end; call symput('hsize',left(put(p,best.))); stop; set small nobs=p; run; %put &hsize.; data arrayhash2(keep=k d); array key {0 : &hsize.} _temporary_; array link {0 : &hsize.} _temporary_; array data {0 : &hsize.} _temporary_; r=&hsize.; do until (lr1); set small end=lr1; h=mod(k, &hsize.) + 1; /* 1 */ found=0; if link [h] > . then do; /* 2 */ link traverse; /* 3 */ if found then continue; do r=r by -1 until (link [r]=.); /* 4 */ end; link [h] = r; /* 5 */ h = r; end; link [h] = 0; /* 6 */ key [h] = k; data [h] = d; end; do until (lr2); set large end=lr2; found=0; h=mod(k, &hsize.) + 1; /* 1 */ if link[h] > . then link traverse; /* 2 */ if found then d = data[h]; /* 3 */ else d=.; output; end; traverse: if k = key [h] then found=1; else if link [h] ne 0 then do; h=link [h]; link traverse; end; run; /* Canned hash for comparison */ data test (keep=k d); if _N_=1 then do; declare hash h(dataset:"small"); h.definekey("k"); h.definedata("d"); h.definedone(); end; set large; call missing(d); rc=h.find(); run; /* The Array Hasinh Macros */ %macro hashSize (n=, data=, load = 0.5) ; /* * n - an apriori determined n * data - let n be number of rows in data * load - load factor * * determine size of array that will contain a hash of N items * at a given load factor. * * Sugi 26 Paper 128-26, Paul M. Dorfman * QUICK DISK TABLE LOOKUP * VIA HYBRID INDEXING INTO A * DIRECTLY ADDRESSED SAS . DATA SET */ %if %length(&data) %then %do; %local dsid; %let dsid = %sysfunc(open(&data)); %if &dsid %then %do; %let n = %sysfunc (attrn(&dsid, NOBS)); %let dsid=%sysfunc(close(&dsid)); %end; %else %do; %put ERROR: hash_size could not open &data. Forcing an error; ;DATA_NOT_FOUND; %end; %end; %local ip up pr ; %let pr = %sysfunc (floor(&n/&load)) ; %do %until ( &ip > &up ) ; %let pr = %eval (&pr + 1) ; %let up = %sysfunc(ceil(%sysfunc(sqrt(&pr)))) ; %if %sysfunc(mod(&pr,2)) = 0 %then %goto failed ; %do ip = 3 %to &up %by 2; %if %sysfunc(mod(&pr,&ip)) = 0 %then %goto failed ; %end ; %failed: %end ; &pr %mend ; %macro declareHash (hash, size, policy=LP, allowDuplicates=NO); %let policy = %upcase(&policy); %let allowDuplicates = %upcase (&allowDuplicates); %if (&policy ne LP and &policy ne CL) %then %do; %put ERROR: Hash collision resolution policy must be LP (Linear Probe) or CL (Coalesced Linking); ;UNKNOWN_POLICY; %goto EndMacro; %end; %if &allowDuplicates ne YES %then %let allowDuplicates = NO; %global hash_size_&hash; %global hash_policy_&hash; %global hash_dups_&hash; %global hash_add_count_&hash; %global hash_lp_increment_&hash; %let hash_size_&hash = &size; %let hash_policy_&hash = &policy; %let hash_dups_&hash = &allowDuplicates; %let hash_add_count_&hash = 1; %let hash_lp_increment_&hash = 3; array &hash._keys (0:&size) _temporary_; %if &policy eq CL %then %do; array &hash._chain (0:&size) _temporary_; retain &hash._chain_addr &size; %end; retain &hash._max_depth 0; %EndMacro: %mend; %macro declareHashKey (hash, key); %global hash_key_&hash; %let hash_key_&hash = &key; length &hash._key_addr 8; %mend; %macro declareHashData (hash, var); %local size; %let size = &&hash_size_&hash; array &hash._&var (0:&size) _temporary_; %mend; %macro hashAddKey (hash, rc=); %* PDV variable named in rc will receive -1 if duplicate key %* found when allowDuplicates = NO; %hashAddKey_&&hash_policy_&hash (&hash, rc=&rc) %mend; %macro hashAddKey_CL (hash, rc=); %local size key count key_addr chain chain_addr keys; %let size = &&hash_size_&hash; %let key = &&hash_key_&hash; %let count = &&hash_add_count_&hash; %let hash_add_count&hash = %eval ( 1 + &count ); %let key_addr = &hash._key_addr; %let chain = &hash._chain; %let chain_addr = &hash._chain_addr; %let keys = &hash._keys; %let end_block = &hash._addkey_end_block_&count; *--------------------; %if %length (&rc) %then &rc = 0;; &key_addr = mod ( &key, &size ) + 1; if &chain ( &key_addr ) ne . then do; %if &&hash_dups_&hash = YES %then %do; %* allowing dup keys, proceed to end of chain; do while ( &chain ( &key_addr ) ne 0 ); &key_addr = &chain ( &key_addr ); end; %end; %else %do; %* disallowing duplicate keys; %* thus quit code block if key found; do while ( 1 ) ; if ( &key = &keys ( &key_addr ) ) then do; %if %length (&rc) %then &rc = -1;; goto &end_block; end; %* stop searching if end of chain reached; if ( &chain ( &key_addr ) = 0 ) then leave; &key_addr = &chain ( &key_addr ); end; %end; %* &key_addr is at the end of a chain %* (where &chain (&key_addr) is 0); %* locate open addr for new end of chain; do &chain_addr = &chain_addr by -1 until ( &chain ( &chain_addr ) = . ); end; %* point to new end of chain and update key addr ; &chain ( &key_addr ) = &chain_addr; &key_addr = &chain_addr; end; %* set end of chain marker and set hash validator; &chain ( &key_addr ) = 0; &keys ( &key_addr ) = &key; &end_block: ; *--------------------; %mend; %macro hashAddKey_LP (hash, rc=); %local size key count increment key_addr keys end_block depth max_depth bound_check; %let size = &&hash_size_&hash; %let key = &&hash_key_&hash; %let count = &&hash_add_count_&hash; %let increment = &&hash_lp_increment_&hash; %let hash_add_count&hash = %eval ( 1 + &count ); %let key_addr = &hash._key_addr; %let keys = &hash._keys; %let end_block = &hash._addkey_end_block_&count; %let depth = &hash._depth; %let max_depth = &hash._max_depth; %if &increment = 0 %then %do; %put ERROR: Linear probe increment must be non-zero; ;CauseError; %goto EndMacro; %end; %else %if &increment > 0 %then %let bound_check = if ( &key_addr > &size ) then &key_addr + ( - &size ); %else %let bound_check = if ( &key_addr < 0 ) then &key_addr + ( + &size ); *--------------------; %if %length (&rc) %then &rc = 0;; &key_addr = mod ( &key, &size ) + 1; if ( &keys ( &key_addr ) ne . ) then do; &depth = 0; do while (1); %if &&hash_dups_&hash = NO %then %do; if ( &key = &keys ( &key_addr ) ) then do; %if %length (&rc) %then &rc = -1;; goto &end_block; end; %end; if ( &keys ( &key_addr ) = . ) then leave; &key_addr + ( &increment ); &bound_check; &depth + 1; end; if ( &depth > &max_depth ) then &max_depth = &depth; end; &keys ( &key_addr ) = &key; &end_block: ; *--------------------; %EndMacro: %mend; %macro keyArray (hash); &hash._keys %mend; %macro chainArray (hash); &hash._chain %mend; %macro dataArray (hash, var); &hash._&var %mend; %macro hashGetKey (hash); %keyArray(&hash) ( &hash._key_addr ) %mend; %macro hashGetData (hash, var); %dataArray(&hash,&var) ( &hash._key_addr ) %mend; %macro hashSetData (hash, var); %dataArray(&hash,&var) ( &hash._key_addr ) = &var; %mend; %macro hashKey (hash); &&hash_key_&hash %mend; %macro hashKeyAddr (hash); &hash._key_addr %mend; %macro hashFetch (hash, rc=); %* PDV variable named in rc will receive -1 if key not found %* and will receive 0 if key found; %hashFetch_&&hash_policy_&hash (&hash, rc=&rc) %mend; %macro hashFetchNext (hash, rc=); %* PDV variable named in rc will receive -1 if key not found %* and will receive 0 if key found; %hashFetchNext_&&hash_policy_&hash (&hash, rc=&rc) %mend; %macro hashFetch_CL (hash, rc); %local size key key_addr chain keys; %let size = &&hash_size_&hash; %let key = &&hash_key_&hash; %let key_addr = &hash._key_addr; %let chain = &hash._chain; %let keys = &hash._keys; %if %length (&rc) %then &rc = -1;; &key_addr = mod ( &key, &size ) + 1; if &chain ( &key_addr ) ne . then do; do while ( 1 ) ; if ( &key = &keys ( &key_addr ) ) then do; %if %length (&rc) %then &rc = 0;; leave; end; if ( &chain ( &key_addr ) = 0 ) then do; &key_addr = -1; leave; end; &key_addr = &chain ( &key_addr ); end; end; else &key_addr = -1; %mend; %macro hashFetchNext_CL (hash, rc); %if &&hash_dups_&hash = NO %then %do; %put ERROR: hashFetchNext can only be called for hashes declared with allowDuplicates=YES; %put ERROR: hash [&hash] was not declared thus.; ;CauseError; %goto EndMacro; %end; %local size key key_addr chain ; %let size = &&hash_size_&hash; %let key = &&hash_key_&hash; %let key_addr = &hash._key_addr; %let chain = &hash._chain; *--------------------; %if %length (&rc) %then &rc = -1;; if &key_addr > -1 then do; &key_addr = &chain ( &key_addr ); if &chain ( &key_addr ) ne . then do; do while ( 1 ) ; if ( &key = &keys ( &key_addr ) ) then do; %if %length (&rc) %then &rc = 0;; leave; end; %* stop searching if end of chain reached; if ( &chain ( &key_addr ) = 0 ) then do; &key_addr = -1; leave; end; &key_addr = &chain ( &key_addr ); end; end; else &key_addr = -1; end; *--------------------; %EndMacro: %mend; %macro hashFetch_LP (hash, rc); %local size key increment key_addr keys depth max_depth bound_check; %let size = &&hash_size_&hash; %let key = &&hash_key_&hash; %let increment = &&hash_lp_increment_&hash; %let key_addr = &hash._key_addr; %let keys = &hash._keys; %let depth = &hash._depth; %let max_depth = &hash._max_depth; %if &increment > 0 %then %let bound_check = if ( &key_addr > &size ) then &key_addr + ( - &size ); %else %let bound_check = if ( &key_addr < 0 ) then &key_addr + ( + &size ); *--------------------; %if %length (&rc) %then &rc = -1;; &key_addr = mod ( &key, &size ) + 1; if &keys ( &key_addr ) ne . then do; &depth = 0; do while ( 1 ) ; if ( &key = &keys ( &key_addr ) ) then do; %if %length (&rc) %then &rc = 0;; leave; end; %* stop searching if end of probe reached; if ( &keys ( &key_addr ) = . or &depth > &max_depth ) then do; &key_addr = -1; leave; end; &key_addr + ( &increment ); &bound_check; &depth + 1; end; end; else &key_addr = -1; *--------------------; %mend; %macro hashFetchNext_LP (hash, rc); %local size key increment key_addr keys depth max_depth bound_check; %let size = &&hash_size_&hash; %let key = &&hash_key_&hash; %let increment = &&hash_lp_increment_&hash; %let key_addr = &hash._key_addr; %let keys = &hash._keys; %let depth = &hash._depth; %let max_depth = &hash._max_depth; %if &increment > 0 %then %let bound_check = if ( &key_addr > &size ) then &key_addr + ( - &size ); %else %let bound_check = if ( &key_addr < 0 ) then &key_addr + ( + &size ); %if %length (&rc) %then &rc = -1;; if ( &key_addr > -1 ) then do; do while ( 1 ) ; &key_addr + ( &increment ); &bound_check; &depth + 1; if ( &key = &keys ( &key_addr ) ) then do; %if %length (&rc) %then &rc = 0;; leave; end; %* stop searching if end of probe reached; if ( &keys ( &key_addr ) = . or &depth > &max_depth ) then do; &key_addr = -1; leave; end; end; end; %mend;