SAS中的Jaro-Winkler字符串比较功能

Ric*_*ron 2 sas jaro-winkler

SAS中是否有Jaro-Winkler字符串比较的实现?

看起来Link King中包含Jaro-Winkler的实现,但我更希望能够自己灵活地调用这个函数.

谢谢!

cmj*_*hns 5

据我所知,SAS没有计算Jaro-Winkler距离的内置函数.@Itzy已经提到了我所知道的唯一一个现成实现.如果您愿意,可以使用proc fcmp编写自己的函数.下面的代码可以作为一个起点.我只是尝试按照维基百科上的文章来实现.无论如何,它肯定不能完全等同于Bill Winkler的strcmp.c文件,并且可能有很多错误.

proc fcmp outlib=work.jaro.chars;

  subroutine jaromatch ( string1 $ , string2 $ , matchChars $);
    outargs matchChars;
    /* Collects the characters of string1 that have a Jaro match in string2.

       Arguments:
         string1    - string whose characters are walked left to right
         string2    - string that is searched for a partner character
         matchChars - (output) the matched characters, in string1 order;
                      blank when nothing matches

       Two characters match when they are equal and their positions are no
       farther apart than floor(max(|s1|, |s2|)/2) - 1 (never less than 0).
       Leading and trailing blanks of both strings are ignored, and a blank
       character is never counted as a match.

       Each position of string2 can be matched only once; this keeps the
       match lists produced for (s1, s2) and (s2, s1) the same length when
       the strings contain repeated characters. */

    /* one "already matched" flag per character of string2 */
    array used[1] / nosymbols;

    /* lengthn returns 0 (not 1) for a blank string */
    str1_len = lengthn(strip(string1));
    str2_len = lengthn(strip(string2));

    matchChars = "";

    if str1_len > 0 and str2_len > 0 then do;
      call dynamic_array(used, str2_len);
      do j = 1 to str2_len;
        used[j] = 0;
      end;

      /* clamp at 0 so one-character strings can still match each other */
      allowedDist = max(0, floor(max(str1_len, str2_len)/2) - 1);

      /* walk through string 1 and match characters to string2 */
      do i = 1 to str1_len;
        /* left() shifts leading blanks away so position i refers to the
           stripped text, consistent with str1_len / str2_len */
        x = substr(left(string1), i, 1);
        lo = max(1, i - allowedDist);
        hi = min(str2_len, i + allowedDist);
        found = 0;
        do j = lo to hi;
          if found = 0 and used[j] = 0 and x ne ' ' then do;
            if substr(left(string2), j, 1) = x then do;
              used[j] = 1;
              /* build list of matched characters */
              matchChars = cats(matchChars, x);
              found = 1;
            end;
          end;
        end;
      end;
    end;
  endsub;


  function jarotrans (string1 $ , string2 $ );
    /* Returns half the number of positions at which the two strings differ.

       Only the first min(|string1|, |string2|) positions are compared,
       where the lengths are taken after stripping blanks. The caller is
       expected to pass two lists of matched characters; each pair of
       out-of-order characters then shows up as two differing positions,
       hence the division by 2. */
    compare_len = min(length(strip(string1)), length(strip(string2)));
    mismatches = 0;
    do pos = 1 to compare_len;
      /* a comparison evaluates to 1 when true and 0 when false */
      mismatches = mismatches + (substr(string1,pos,1) ne substr(string2,pos,1));
    end;
    return(mismatches/2);
  endsub;

  function getPrefixlen( string1 $ , string2 $, maxprelen);
     /* Returns the length of the common prefix of string1 and string2,
        capped at maxprelen.

        Arguments:
          string1, string2 - strings to compare from position 1
          maxprelen        - largest prefix length to report

        Returns 0 when the first characters differ or when either string
        is blank, and the cap (or the shorter string's length) when every
        compared position agrees. */

     /* lengthn gives 0 for a blank string, so blanks never count as a
        shared prefix; floor keeps the result an integer */
     n = max(0, floor(min(maxprelen, lengthn(string1), lengthn(string2))));
     do i = 1 to n;
       /* first mismatch at position i means i-1 characters agreed */
       if substr(string1,i,1) ne substr(string2,i,1)
       then return(i-1);
     end;
     /* no mismatch inside the compared range: the whole range is shared */
     return(n);
  endsub;

  function jarodist(string1 $, string2 $);
    /* Returns the Jaro similarity of string1 and string2, a value between
       0 (nothing in common) and 1 (identical).

       Despite the name, larger values mean the strings are MORE alike.
       Returns 0 when either string is blank or when no characters match.

       Computed as ( m1/|s1| + m2/|s2| + (m1 - t)/m1 ) / 3, where m1 and m2
       are the numbers of matched characters found from each side and t is
       the transposition count reported by jarotrans. Lengths are taken
       after stripping leading and trailing blanks. */

    /* the match lists must be character variables; lists longer than
       1024 characters would be truncated */
    length m1 m2 $ 1024;

    /* lengthn returns 0 for a blank string, length would return 1 */
    len1 = lengthn(strip(string1));
    len2 = lengthn(strip(string2));
    if len1 = 0 or len2 = 0 then return(0);

    /* get number of matched characters */
    call jaromatch(string1, string2, m1);
    m1_len = lengthn(m1);
    if m1_len = 0 then return(0);
    call jaromatch(string2, string1, m2);
    m2_len = lengthn(m2);
    if m2_len = 0 then return(0);

    /* get number of transposed characters */
    ntrans = jarotrans(m1, m2);
    j_dist =  (m1_len/len1
             + m2_len/len2
             + (m1_len-ntrans)/m1_len )  /  3;
    return(j_dist);
  endsub;

  function jarowink( string1 $, string2 $, prefixscale);
    /* Returns the Jaro-Winkler score of string1 and string2.

       Arguments:
         string1, string2 - strings to compare
         prefixscale      - weight given to each shared leading character

       The score is the value returned by jarodist, raised by
       prefix_len * prefixscale * (1 - score), where prefix_len is the
       value returned by getPrefixlen with a cap of 4 characters. */
    base_score = jarodist(string1, string2);
    prefix_len = getPrefixlen(string1, string2, 4);
    /* apply the prefix bonus unless the reported prefix length is 0 */
    if prefix_len ne 0 then
      return(base_score + prefix_len * prefixscale * (1-base_score));
    return(base_score);
  endsub;

run;quit;

/* Point the function search path at the library that holds the compiled
   PROC FCMP routines. */
options cmplib=work.jaro;

/* Smoke test: score one pair of names and write both results to the log.
   x is the plain Jaro score, y the Jaro-Winkler score with a prefix
   weight of 0.1. */
data _null_;
  left_name  = 'DIXON';
  right_name = 'DICKSONX';
  x = jarodist(left_name, right_name);
  y = jarowink(left_name, right_name, 0.1);
  put x= y=;
run;
Run Code Online (Sandbox Code Playgroud)