/*****************************************************************************************************************
   SAS file name: fuzzy_grouping.sas
   File location:
__________________________________________________________________________________________________________________
   Purpose:       To demonstrate how to use the hash object to solve fuzzy grouping problems in SAS.
   Author:        Peter Clemmensen
   Creation Date: 23/12/2020

   This program supports the blog post "Complicated Fuzzy Grouping With The SAS Hash Object" on SASnrd.com
*****************************************************************************************************************/

/* 1 */

/* Sample input: one name per record. */
data have;
input Name:$20.;
datalines;
Michael
Michel
kurt
kirt
Michaell
Benjamin
Mich
;

/*
   Assign each name to a group.

   h  : keyed by a name (comp); stores that name and the group it was assigned to.
   hi : iterator over h, used to walk every name seen so far.
   hh : multidata table keyed by group; one item per member name of the group.

   A name that is already in h reuses its stored group. Otherwise the name
   joins the first group found in which COMPLEV(name, member) <= 2 holds for
   every member stored in hh. If no such group exists, a new group is opened.
*/
data want(keep = name group);
   length name $20 comp $20 group 8;

   if _n_ = 1 then do;
      declare hash h();
      h.definekey('comp');
      h.definedata('comp', 'group');
      h.definedone();
      declare hiter hi('h');

      declare hash hh(multidata:'y');
      hh.definekey('group');
      hh.definedata('group', 'comp');
      hh.definedone();

      _group = 0;
   end;

   set have;

   /* Exact hit: comp and group are loaded from h and written out as-is. */
   rc = h.find(key:name);

   if rc ne 0 then do;
      rc = hi.first();
      do while (rc = 0);

         /* Only inspect the whole group when the current known name is close. */
         if complev(name, comp) le 2 then do;
            rc = hh.find();

            /* Reset the has_next flag for every candidate group. Without this,
               a value of 0 left over from a previously rejected group would
               make the loop below be skipped entirely. */
            r = .;
            do while (r ne 0);
               dist = complev(name, comp);
               hh.has_next(result : r);

               /* Last member reached and every member was close: join the group. */
               if r = 0 & dist <= 2 then do;
                  h.add(key : name, data : name, data : group);
                  /* Store the NEW name as a member. A bare hh.add() would
                     re-insert comp, i.e. the member that was just read. */
                  hh.add(key : group, data : group, data : name);
                  output;
                  return;
               end;
               else if r ne 0 & dist le 2 then rc = hh.find_next();
               else if dist > 2 then leave;
            end;
         end;

         rc = hi.next();
      end;

      /* No existing group accepted the name: open a new one.
         The sum statement also retains _group across observations. */
      _group + 1;
      group = _group;
      h.add(key : name, data : name, data : group);
      hh.add(key : group, data : group, data : name);
   end;

   output;
run;

/* 2 */

/* Sample input. NAME is read with column input from columns 11-36, so the
   records below must keep their column alignment. ID is missing on some
   records (MISSOVER keeps the pointer on the current record). */
data have;
infile datalines missover;
input num_A num_B $ name $ 11-36 birth_date :ddmmyy10. id;
format birth_date ddmmyy10.;
datalines;
5785 fbff João Simões Marques        12/05/2000 7
1234 abcd M Rita Costa Santos        01/01/2020 1
3333 uvwx M Rita Costa Santos        01/01/2020 1
5678 efgh Maria Rita C Santos        01/01/2020
9101 ijkl Rita Costa Santos          01/01/2020 1
1111 mnop Maria Leonor Santos Silva  02/03/2001 2
2222 qrst Leonor Santos Silva        02/03/2001
4444 yzab Leonor Santos Silva        30/08/1999
6565 afgg Donald J Trump             01/01/1960
2423 sgty Donald J Trump             01/01/1960
9876 hgvb Pedro Costa Santos         05/09/1990 9
7865 jnbv Luís Miguel Silva          05/09/1990
;

/*
   Fill in missing ids.

   h1 : (name, birth_date) -> id, for exact lookups.
   h2 : multidata table keyed by birth_date; one (name, id) item per record
        that already carries an id. These are the fuzzy-match candidates.

   For a record without an id:
     1. an exact (name, birth_date) hit in h1 supplies the id;
     2. otherwise every h2 candidate with the same birth_date is compared and
        a candidate with COMPLEV(name, candidate) < 10 supplies the id
        (when several candidates qualify, the last one visited wins);
     3. otherwise a new id (largest id so far + 1) is assigned.
*/
data want(keep = num_A num_B name birth_date id);
   format num_A num_B name birth_date id;

   if _N_ = 1 then do;
      dcl hash h1 ();
      h1.definekey("name", "birth_date");
      h1.definedata("i");
      h1.definedone();

      dcl hash h2 (multidata : "Y");
      h2.definekey("birth_date");
      h2.definedata("n", "i");
      h2.definedone();

      /* Load every record that already has an id. */
      do until (z);
         set have(rename=(id=i name=n) where = (i)) end = z;

         /* NAME is renamed to N in this read, so the host variable name is
            still blank here; the key has to be passed explicitly. */
         if h1.check(key : n, key : birth_date) ne 0 then
            h1.add(key : n, key : birth_date, data : i);

         /* ADD rather than REF: REF inserts only when the key is absent,
            which would keep a single candidate per birth_date. */
         h2.add();

         maxid = max(maxid, i);
      end;
   end;

   set have;

   if id = . then do;
      if h1.find() ne 0 then do;
         /* DO_OVER walks all h2 items stored under the current birth_date. */
         do while (h2.do_over() = 0);
            if complev(name, n) < 10 then id = i;
         end;

         /* Remember the resolved name so that a later identical record is
            answered by the exact lookup. h1.find() failed above, so the key
            cannot exist yet. */
         if id ne . then h1.add(key : name, key : birth_date, data : id);
      end;
      else id = i;
   end;

   /* Still unresolved: hand out a new id. The sum statement retains maxid. */
   if id = . then do;
      maxid + 1;
      id = maxid;
      h1.ref(key : name, key : birth_date, data : id);
   end;
run;