/***************************************************************************************************************** SAS file name: fuzzy.sas File location: __________________________________________________________________________________________________________________ Purpose: To demonstrate another fuzzy lookup technique using the SAS hash object Author: Peter Clemmensen Creation Date: 27/02/2021 This program supports the blog post "A Seven of Nine Fuzzy Matching Problem" on SASnrd.com *****************************************************************************************************************/ /* Example Data */ data one; input ssn \$9.; datalines; 123456789 987654321 121212121 343434343 ; data two; input ssn \$9.; datalines; 123456789 987645321 121212121 132456789 232323232 ; /* Proc SQL */ proc sql; create table matches as select a.ssn as ssn_one, b.ssn as ssn_two, case when (a.ssn ne b.ssn) then 'Not Exact' else 'Exact ' end as MatchResults from one a, two b where sum(char(a.ssn, 1) = char(b.ssn, 1), char(a.ssn, 2) = char(b.ssn, 2), char(a.ssn, 3) = char(b.ssn, 3), char(a.ssn, 4) = char(b.ssn, 4), char(a.ssn, 5) = char(b.ssn, 5), char(a.ssn, 6) = char(b.ssn, 6), char(a.ssn, 7) = char(b.ssn, 7), char(a.ssn, 8) = char(b.ssn, 8), char(a.ssn, 9) = char(b.ssn, 9) ) ge 7; quit; /* Hash Object */ data want(keep = ssn1 ssn2 MatchResults); array ss {2, 5000000} \$ 9 _temporary_; dcl hash h(hashexp : 20); h.definekey ("id", "n"); h.definedata("ssn1"); h.definedone(); dcl hash m(hashexp : 20); m.definekey("ssn1", "ssn2"); m.definedone(); do i = 1 to 8; do j = i + 1 to 9; do p = 1 to n1; if i = 1 and j = 2 then do; set one(rename = ssn = ssn1) nobs = n1; ss[1, p] = ssn1; end; else ssn1 = ss[1, p]; id = ssn1; substr(id, i, 1) = "*"; substr(id, j, 1) = "*"; do n = 1 by 1 until (h.check() ne 0); end; h.add(); end; do p = 1 to n2; if i = 1 and j = 2 then do; set two(rename = ssn = ssn2) nobs = n2; ss[2, p] = ssn2; end; else ssn2 = ss[2, p]; id = ssn2; substr(id, i, 1) = "*"; substr(id, j, 1) = "*"; do n = 1 by 1; if h.find() ne 0 then leave; if m.check() = 0 then leave; if ssn1 = ssn2 then MatchResults = "Y"; else MatchResults = "N"; output; m.add(); end; end; h.clear(); end; end; stop; run; /* Testing scalability on larger data */ data one two; length ssn \$ 9; do _N_ = 1 to 1e4; ssn = put (ceil (ranuni(1) * 1e9), z9.); output one; ssn = put (ceil (ranuni(1) * 1e9), z9.); output two; end; run; /* Proc SQL */ proc sql; create table matches as select a.ssn as ssn_one, b.ssn as ssn_two, case when (a.ssn ne b.ssn) then 'Not Exact' else 'Exact ' end as MatchResults from one a, two b where sum(char(a.ssn, 1) = char(b.ssn, 1), char(a.ssn, 2) = char(b.ssn, 2), char(a.ssn, 3) = char(b.ssn, 3), char(a.ssn, 4) = char(b.ssn, 4), char(a.ssn, 5) = char(b.ssn, 5), char(a.ssn, 6) = char(b.ssn, 6), char(a.ssn, 7) = char(b.ssn, 7), char(a.ssn, 8) = char(b.ssn, 8), char(a.ssn, 9) = char(b.ssn, 9) ) ge 7; quit; /* Hash Object */ data want(keep = ssn1 ssn2 MatchResults); array ss {2, 5000000} \$ 9 _temporary_; dcl hash h(hashexp : 20); h.definekey ("id", "n"); h.definedata("ssn1"); h.definedone(); dcl hash m(hashexp : 20); m.definekey("ssn1", "ssn2"); m.definedone(); do i = 1 to 8; do j = i + 1 to 9; do p = 1 to n1; if i = 1 and j = 2 then do; set one(rename = ssn = ssn1) nobs = n1; ss[1, p] = ssn1; end; else ssn1 = ss[1, p]; id = ssn1; substr(id, i, 1) = "*"; substr(id, j, 1) = "*"; do n = 1 by 1 until (h.check() ne 0); end; h.add(); end; do p = 1 to n2; if i = 1 and j = 2 then do; set two(rename = ssn = ssn2) nobs = n2; ss[2, p] = ssn2; end; else ssn2 = ss[2, p]; id = ssn2; substr(id, i, 1) = "*"; substr(id, j, 1) = "*"; do n = 1 by 1; if h.find() ne 0 then leave; if m.check() = 0 then leave; if ssn1 = ssn2 then MatchResults = "Exact "; else MatchResults = "Not Exact"; output; m.add(); end; end; h.clear(); end; end; stop; run; /* Verify that the results are identical */ proc compare base=matches comp=want;run;