Serious 'tr' bug, patch for review included

From: Andrey Chernov <ache_at_nagual.pp.ru>
Date: Fri, 1 Aug 2003 04:44:08 +0400
This patch addresses two problems.

The 1st one is relatively minor: according to our own manpage, the upper and
lower classes must be sorted, but currently they are not.

The 2nd one is serious:
	tr '[:lower:]' '[:upper:]'
(and vice versa) currently works only if the upper and lower classes
have exactly the same number of elements. When that is not true, as in
many ISO8859-x locales which have a larger number of lowercase letters,
tr may do nasty things. The patch is complex because the whole conversion
string needs to be processed each time an l->u or u->l conversion occurs,
not a single character at a time as in the previous variant.

See this page
http://www.opengroup.org/onlinepubs/007908799/xcu/tr.html
for a detailed description of the desired tr behaviour in such cases.

Please test this patch on your system & locale and report any strange
things to me.

diff -u ./extern.h /usr/src/usr.bin/tr/extern.h
--- ./extern.h	Fri Jun 14 19:56:52 2002
+++ /usr/src/usr.bin/tr/extern.h	Fri Aug  1 04:19:36 2003
@@ -40,7 +40,8 @@
 
 typedef struct {
 	enum { STRING1, STRING2 } which;
-	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
+	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE,
+	       SET, SET_UPPER, SET_LOWER } state;
 	int	 cnt;			/* character count */
 	int	 lastch;		/* last character */
 	int	equiv[NCHARS];		/* equivalence set */
@@ -49,3 +50,5 @@
 } STR;
 
 int	 next(STR *);
+int charcoll(const void *, const void *);
+
diff -u ./str.c /usr/src/usr.bin/tr/str.c
--- ./str.c	Fri Jul  5 13:28:13 2002
+++ /usr/src/usr.bin/tr/str.c	Fri Aug  1 04:22:11 2003
@@ -106,6 +106,8 @@
 		}
 		return (1);
 	case SET:
+	case SET_UPPER:
+	case SET_LOWER:
 		if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
 			s->state = NORMAL;
 			return (next(s));
@@ -194,7 +196,7 @@
 {
 	int cnt, (*func)(int);
 	CLASS *cp, tmp;
-	int *p;
+	int *p, n;
 
 	tmp.name = s->str;
 	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
@@ -208,10 +210,18 @@
 		if ((func)(cnt))
 			*p++ = cnt;
 	*p = OOBCH;
+	n = p - cp->set;
 
 	s->cnt = 0;
-	s->state = SET;
 	s->set = cp->set;
+	if (strcmp(s->str, "upper") == 0)
+		s->state = SET_UPPER;
+	else if (strcmp(s->str, "lower") == 0) {
+		s->state = SET_LOWER;
+	} else
+		s->state = SET;
+	if ((s->state == SET_LOWER || s->state == SET_UPPER) && n > 1)
+		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
 }
 
 static int
diff -u ./tr.c /usr/src/usr.bin/tr/tr.c
--- ./tr.c	Thu Sep  5 03:29:07 2002
+++ /usr/src/usr.bin/tr/tr.c	Fri Aug  1 04:32:01 2003
@@ -101,8 +101,9 @@
 STR s1 = { STRING1, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
 STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
 
-static int charcoll(const void *, const void *);
 static void setup(int *, char *, STR *, int, int);
+static void process_upper(int);
+static void process_lower(int);
 static void usage(void);
 
 int
@@ -110,7 +111,7 @@
 {
 	static int collorder[NCHARS], tmpmap[NCHARS];
 	int ch, cnt, lastch, *p;
-	int Cflag, cflag, dflag, sflag, isstring2;
+	int Cflag, cflag, dflag, sflag, isstring2, do_upper, do_lower;
 
 	(void)setlocale(LC_ALL, "");
 
@@ -224,19 +225,67 @@
 	if (!next(&s2))
 		errx(1, "empty string2");
 
-	ch = s2.lastch;
+	do_upper = do_lower = 0;
 	/* If string2 runs out of characters, use the last one specified. */
-	if (sflag)
-		while (next(&s1)) {
-			string1[s1.lastch] = ch = s2.lastch;
-			string2[ch] = 1;
-			(void)next(&s2);
-		}
-	else
-		while (next(&s1)) {
-			string1[s1.lastch] = ch = s2.lastch;
-			(void)next(&s2);
+	while (next(&s1)) {
+		if (s1.state == SET_LOWER &&
+		    s2.state == SET_UPPER) {
+			if (do_lower) {
+				process_lower(sflag);
+				do_lower = 0;
+			}
+			do_upper = 1;
+		} else if (s1.state == SET_UPPER &&
+			   s2.state == SET_LOWER) {
+			if (do_upper) {
+				process_upper(sflag);
+				do_upper = 0;
+			}
+			do_lower = 1;
+		} else {
+			if (do_lower) {
+				/* Skip until aligned */
+				if (s1.state == SET_UPPER) {
+					do {
+						if (!next(&s1))
+							goto endloop;
+					} while (s1.state == SET_UPPER);
+				} else if (s2.state == SET_LOWER) {
+					do {
+						if (!next(&s2))
+							break;
+					} while (s2.state == SET_LOWER);
+				}
+				process_lower(sflag);
+				do_lower = 0;
+			} else if (do_upper) {
+				/* Skip until aligned */
+				if (s1.state == SET_LOWER) {
+					do {
+						if (!next(&s1))
+							goto endloop;
+					} while (s1.state == SET_LOWER);
+				} else if (s2.state == SET_UPPER) {
+					do {
+						if (!next(&s2))
+							break;
+					} while (s2.state == SET_UPPER);
+				}
+				process_upper(sflag);
+				do_upper = 0;
+			}
+			string1[s1.lastch] = s2.lastch;
+			if (sflag)
+				string2[s2.lastch] = 1;
 		}
+		(void)next(&s2);
+	}
+endloop:
+	if (do_lower)
+		process_lower(sflag);
+	else if (do_upper)
+		process_upper(sflag);
+	/* End of upper & lower special processing */
 
 	if (cflag || Cflag) {
 		s2.str = argv[1];
@@ -294,15 +343,55 @@
 			string[cnt] = !string[cnt] && ISCHAR(cnt);
 }
 
-static int
+int
 charcoll(const void *a, const void *b)
 {
-	char sa[2], sb[2];
+	static char sa[2], sb[2];
 
 	sa[0] = *(const int *)a;
 	sb[0] = *(const int *)b;
-	sa[1] = sb[1] = '\0';
 	return (strcoll(sa, sb));
+}
+
+
+/*
+ * For -s result will contain only those characters defined
+ * as the second characters in each of the toupper or tolower
+ * pairs.
+ */
+
+static void
+process_upper(int sflag)
+{
+	int cnt, ch;
+
+	for (cnt = 0; cnt < NCHARS; cnt++) {
+		ch = string1[cnt];
+		if (ch == OOBCH)        /* [Cc]flag */
+			ch = cnt;
+		if (islower(ch)) {
+			string1[cnt] = ch = toupper(ch);
+			if (sflag && isupper(ch))
+				string2[ch] = 1;
+		}
+	}
+}
+
+static void
+process_lower(int sflag)
+{
+	int cnt, ch;
+
+	for (cnt = 0; cnt < NCHARS; cnt++) {
+		ch = string1[cnt];
+		if (ch == OOBCH)        /* [Cc]flag */
+			ch = cnt;
+		if (isupper(ch)) {
+			string1[cnt] = ch = tolower(ch);
+			if (sflag && islower(ch))
+				string2[ch] = 1;
+		}
+	}
 }
 
 static void
Received on Thu Jul 31 2003 - 15:44:13 UTC

This archive was generated by hypermail 2.4.0 : Wed May 19 2021 - 11:37:17 UTC