comfix.awk (1202B)
# comfix.awk -- expand comma-abbreviated headwords in a raw index.
#
# A raw index may contain many entries of the form
#	1578324	problematico, a, ci, che
# where everything after the first comma is a replacement suffix for the
# first word.  For each such entry this script prints the first word on a
# line of its own, followed by one line per suffix:
#   - a single-letter suffix replaces the last letter of the word;
#   - a longer suffix is matched by searching backwards through the word
#     for the suffix's first letter; if the tail found that way is about
#     the same length as the suffix, the suffix replaces that tail;
#   - otherwise the line is written as "word, suffix".
# Some commas will therefore still have to be fixed by hand.
#
# Usage: awk -F'<sep>' -f comfix.awk rawindex > newrawindex
# NOTE(review): <sep> is presumably a single literal tab, matching the
# separator this script writes.  With a blank as separator the sample
# entry above splits into more than two fields and is passed through
# unchanged.

# Pass through every record that is not a two-field entry, and every
# two-field entry whose second field contains no comma.
NF != 2 || index($2, ",") == 0 {
	print $0
	next
}

# Two fields and at least one comma in the second: emit one line per form.
{
	nparts = split($2, part, /,[ ]*/)
	base = part[1]
	printf "%s\t%s\n", $1, base
	for (j = 2; j <= nparts; j++) {
		cut = matchsuflen(base, part[j])
		if (cut == 0) {
			# no plausible place to attach the suffix
			printf "%s\t%s\n", $1, base ", " part[j]
			continue
		}
		printf "%s\t%s\n", $1, substr(base, 1, length(base) - cut) part[j]
	}
}

# matchsuflen(w, suf): number of trailing characters of w that suf should
# replace, or 0 when no acceptable match is found.
#   - a one-character suf always yields 1;
#   - otherwise the last occurrence in w of suf's first character marks the
#     start of the tail to replace; the tail's length is returned only if
#     it differs from length(suf) by at most 3.
# Names after the gap in the parameter list are locals.
function matchsuflen(w, suf,		nw, ns, first, pos, back, gap)
{
	ns = length(suf)
	if (ns == 1)
		return 1

	nw = length(w)
	first = substr(suf, 1, 1)
	# walk from the last character of w towards the first
	for (pos = nw; pos >= 1; pos--)
		if (substr(w, pos, 1) == first)
			break
	if (pos < 1)
		return 0

	back = nw - pos + 1
	gap = (back > ns) ? back - ns : ns - back
	return (gap > 3) ? 0 : back
}