/***************************************************************************************************************** SAS file name: top_n_hashofhashes.sas File location: __________________________________________________________________________________________________________________ Purpose: To demonstrate how to create top N lists by group using the hash of hashes technique. Author: Peter Clemmensen Creation Date: 22/12/2020 This program supports the blog post "Top N By Group with Hash of Hashes in SAS" on SASnrd.com *****************************************************************************************************************/ /* Clean log and output */ dm log "clear"; dm output "clear"; /* Example data */ proc sql; create table iris as select * from sashelp.iris order by rand('uniform'); quit; /* Top 10 by group */ data want1; dcl hash hoh(ordered : "Y"); hoh.definekey("species"); hoh.definedata("h", "hi", "species"); hoh.definedone(); dcl hiter i("hoh"); do until(z); set iris end = z; if hoh.find() ne 0 then do; dcl hash h(dataset : "iris(obs=0)", multidata : "Y", ordered : "D"); h.definekey("sepallength"); h.definedata(all : "Y"); h.definedone(); dcl hiter hi("h"); hoh.add(); end; h.add(); end; do while(i.next() = 0); do _N_ = 1 by 1 while(hi.next() = 0 & _N_ <= 10); output; end; end; run; /* More Memory Efficient */ data want2; dcl hash hoh(ordered : "Y"); hoh.definekey("species"); hoh.definedata("h", "hi", "species"); hoh.definedone(); dcl hiter i("hoh"); do until(z); set iris end = z; if hoh.find() ne 0 then do; dcl hash h(dataset : "iris(obs=0)", multidata : "Y", ordered : "D"); h.definekey("sepallength"); h.definedata(all : "Y"); h.definedone(); dcl hiter hi("h"); hoh.add(); end; h.add(); if h.num_items > 10 then do; _N_ = hi.last(); _N_ = hi.next(); _N_ = h.find(); do while(1); _N_ = h.has_next(result : next); if not next then leave; _N_ = h.find_next(); end; _N_ = h.removedup(); end; end; do while(i.next() = 0); do _N_ = 1 by 1 while(hi.next() = 0 & _N_ <= 10); output; end; end; drop next; run; proc compare base = want1 comp = want2; run; /* Compare performance */ /* Large example data */ data have; do _N_ = 1 to 1e7; k = ceil(rand('uniform')*100); d = ceil(rand('uniform')*1e6); output; end; run; /* Hash of hashes - All data will eventually be read into memory */ data want1; dcl hash hoh(ordered : "Y"); hoh.definekey("k"); hoh.definedata("h", "hi", "k"); hoh.definedone(); dcl hiter i("hoh"); before = input(getoption('xmrlmem'),20.); do until(z); set have end = z; if hoh.find() ne 0 then do; dcl hash h(multidata : "Y", ordered : "D"); h.definekey("d"); h.definedata("k", "d"); h.definedone(); dcl hiter hi("h"); hoh.add(); end; h.add(); end; after = input(getoption('xmrlmem'),20.); hashsize = before-after; put hashsize= sizekmg10.2; do while(i.next() = 0); do _N_ = 1 by 1 while(hi.next() = 0 & _N_ <= 10); output; end; end; drop before after hashsize; run; /* Memory management */ data want2; dcl hash hoh(ordered : "Y"); hoh.definekey("k"); hoh.definedata("h", "hi", "k"); hoh.definedone(); dcl hiter i("hoh"); before = input(getoption('xmrlmem'),20.); do until(z); set have end = z; if hoh.find() ne 0 then do; dcl hash h(multidata : "Y", ordered : "D"); h.definekey("d"); h.definedata("k", "d"); h.definedone(); dcl hiter hi("h"); hoh.add(); end; h.add(); if h.num_items > 10 then do; _N_ = hi.last(); _N_ = hi.next(); _N_ = h.find(); do while(1); _N_ = h.has_next(result : next); if not next then leave; _N_ = h.find_next(); end; _N_ = h.removedup(); end; end; after = input(getoption('xmrlmem'),20.); hashsize = before-after; put hashsize= sizekmg10.2; do while(i.next() = 0); do _N_ = 1 by 1 while(hi.next() = 0 & _N_ <= 10); output; end; end; drop before after hashsize next; run; /* Verify that the results are identical */ proc compare base = want1 comp = want2; run;