commit 95220bf88775deab4a037264d08b21bacc612d70
parent 3850e6e177677885074c8896ef24534894726ad5
Author: sean <phonologus@gmail.com>
Date:   Thu, 21 May 2020 16:10:30 +0100
ed: handle Unicode beyond the BMP correctly in list mode.
List mode was constrained to the BMP. This change introduces
the following new list mode convention, using Go string literal syntax:
Non-printing ASCII characters display as \xhh.
Non-ASCII characters in the BMP display as \uhhhh.
Characters beyond the BMP display as \Uhhhhhhhh.
Diffstat:
2 files changed, 42 insertions(+), 11 deletions(-)
diff --git a/man/man1/ed.1 b/man/man1/ed.1
@@ -441,10 +441,18 @@ a backspace as
 .LR \eb ,
 backslashes as
 .LR \e\e ,
-and non-printing characters as
+and non-printing ASCII characters as
 a backslash, an
 .LR x ,
-and four hexadecimal digits.
+and two hexadecimal digits.
+Non-ASCII characters in the Basic Multilingual Plane
+are printed as a backslash, a lowercase
+.LR u ,
+and four hexadecimal digits; and characters above the
+Basic Multilingual Plane are printed as a backslash,
+an uppercase
+.LR U ,
+and eight hexadecimal digits.
 Long lines are folded,
 with the second and subsequent sub-lines indented one tab stop.
 If the last character in the line is a blank,
diff --git a/src/cmd/ed.c b/src/cmd/ed.c
@@ -21,6 +21,12 @@ enum
 	EOF	= -1
 };
 
+enum
+{
+	LINELEN = 70,	/* max number of glyphs in a display line; also the byte size of line[] */
+	BELL = 6	/* margin subtracted from LINELEN by the fold and flush checks in putchr; NOTE(review): the backslash-U form emits 10 glyphs, more than BELL -- confirm the margin is sufficient */
+};
+
 void	(*oldhup)(int);
 void	(*oldquit)(int);
 int*	addr1;
@@ -40,7 +46,7 @@ int	ichanged;
 int	io;
 Biobuf	iobuf;
 int	lastc;
-char	line[70];
+char	line[LINELEN];
 Rune*	linebp;
 Rune	linebuf[LBSIZE];
 int	listf;
@@ -1543,7 +1549,7 @@ putchr(int ac)
 				*lp++ = 'n';
 			}
 		} else {
-			if(col > (72-6-2)) {
+			if(col > (LINELEN-BELL)) {
 				col = 8;
 				*lp++ = '\\';
 				*lp++ = '\n';
@@ -1558,15 +1564,32 @@ putchr(int ac)
 				if(c == '\t')
 					c = 't';
 				col++;
-			} else
-			if(c<' ' || c>='\177') {
+			} else if (c<' ' || c=='\177') {
 				*lp++ = '\\';
 				*lp++ = 'x';
-				*lp++ =  hex[c>>12];
-				*lp++ =  hex[c>>8&0xF];
-				*lp++ =  hex[c>>4&0xF];
-				c     =  hex[c&0xF];
+				*lp++ = hex[(c>>4)&0xF];
+				c     = hex[c&0xF];
+				col += 3;
+			} else if (c>'\177' && c<=0xFFFF) {
+				*lp++ = '\\';
+				*lp++ = 'u';
+				*lp++ = hex[(c>>12)&0xF];
+				*lp++ = hex[(c>>8)&0xF];
+				*lp++ = hex[(c>>4)&0xF];
+				c     = hex[c&0xF];
 				col += 5;
+			} else if (c>0xFFFF) {
+				*lp++ = '\\';
+				*lp++ = 'U';
+				*lp++ = hex[(c>>28)&0xF];
+				*lp++ = hex[(c>>24)&0xF];
+				*lp++ = hex[(c>>20)&0xF];
+				*lp++ = hex[(c>>16)&0xF];
+				*lp++ = hex[(c>>12)&0xF];
+				*lp++ = hex[(c>>8)&0xF];
+				*lp++ = hex[(c>>4)&0xF];
+				c     = hex[c&0xF];
+				col += 9;
 			}
 		}
 	}
@@ -1574,7 +1597,7 @@ putchr(int ac)
 	rune = c;
 	lp += runetochar(lp, &rune);
 
-	if(c == '\n' || lp >= &line[sizeof(line)-5]) {
+	if(c == '\n' || lp >= &line[LINELEN-BELL]) {
 		linp = line;
 		write(oflag? 2: 1, line, lp-line);
 		return;