plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

comfix.awk (1202B)


      1 # when raw index has a lot of entries like
      2 # 1578324	problematico, a, ci, che
      3 # apply this algorithm:
      4 #  treat things after comma as suffixes
      5 #  for each suffix:
      6 #      if single letter, replace last letter
      7 #      else search backwards for beginning of suffix
      8 #      and if it leads to an old suffix of approximately
      9 #      the same length, replace that old suffix
     10 # This will still leave some commas to fix by hand
     11 # Usage: awk -F'	' -f comfix.awk rawindex > newrawindex
     12 
     13 NF == 2	{
     14 		i = index($2, ",")
     15 		if(i == 0 || length($2) == 0)
     16 			print $0
     17 		else {
     18 			n = split($2, a, /,[ ]*/)
     19 			w = a[1]
     20 			printf "%s\t%s\n", $1, w
     21 			for(i = 2; i <= n; i++) {
     22 				suf = a[i]
     23 				m = matchsuflen(w, suf)
     24 				if(m) {
     25 					nw = substr(w, 1, length(w)-m) suf
     26 					printf "%s\t%s\n", $1, nw
     27 				} else
     28 					printf "%s\t%s\n", $1, w ", " suf
     29 			}
     30 		}
     31 	}
     32 NF != 2 {
     33 	print $0
     34 	}
     35 
     36 function matchsuflen(w, suf,		wlen,suflen,c,pat,k,d)
     37 {
     38 	wlen = length(w)
     39 	suflen = length(suf)
     40 	if(suflen == 1)
     41 		return 1
     42 	else {
     43 		c = substr(suf, 1, 1)
     44 		for (k = 1; k <= wlen ; k++)
     45 			if(substr(w, wlen-k+1, 1) == c)
     46 				break
     47 		if(k > wlen)
     48 			return 0
     49 		d = k-suflen
     50 		if(d < 0)
     51 			d = -d
     52 		if(d > 3)
     53 			return 0
     54 		return k
     55 	}
     56 }