Wednesday, May 25, 2011

Re: [BUG] 'non-empty string' >? '' returns false on amd64 arch

diff --git a/src/mbyte.c b/src/mbyte.c
index ce0c897..1e9d253 100644
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3079,6 +3079,143 @@ utf_isupper(a)
return (utf_tolower(a) != a);
}

+/*
+ * Try to decode the next UTF-8 character from "*s", looking at no more
+ * than "*n" bytes.
+ *
+ * Returns:
+ *   > 0  the decoded code point; "*s" is advanced past the character and
+ *        "*n" is reduced by its byte length.
+ *   0    end of string: either "*n" is zero (nothing is read) or a NUL
+ *        byte was read (the NUL is consumed like any one-byte character).
+ *   -1   the byte sequence is illegal or cut off by "*n"; "*s" and "*n"
+ *        are left unchanged so the caller can compare the raw bytes.
+ */
+    static int
+utf_safe_read_char_adv(s, n)
+    char_u      **s;
+    size_t      *n;
+{
+    int         c, k;
+
+    if (*n == 0)
+        return 0;               /* end of buffer */
+
+    /* NOTE(review): utf8len_tab_zero[] is assumed to give the sequence
+     * length for a lead byte, with 0 for illegal lead bytes - confirm
+     * against its definition. */
+    k = utf8len_tab_zero[**s];
+    if (k == 1)
+    {
+        /* ASCII character, possibly NUL */
+        (*n)--;
+        return *(*s)++;
+    }
+
+    if ((size_t)k <= *n)
+    {
+        /* The whole sequence fits in the buffer (or k is 0), so
+         * utf_ptr2char() does not read past "*n" bytes. */
+        c = utf_ptr2char(*s);
+
+        /* NOTE(review): utf_ptr2char() is assumed to return the first
+         * byte when the sequence is invalid - confirm against its
+         * definition.  Testing "c >= 256" here would wrongly reject the
+         * valid two-byte characters U+0080..U+00FF, so compare with the
+         * first byte instead.  The only character equal to the first byte
+         * of its own encoding is U+00C3 (0xC3 0x83); check it explicitly.
+         * Reading (*s)[1] is safe: a 0xC3 lead byte means k == 2 <= *n. */
+        if (c != (int)(**s) || (c == 0xC3 && (*s)[1] == 0x83))
+        {
+            *s += k;            /* success */
+            *n -= k;
+            return c;
+        }
+    }
+
+    return -1;                  /* incomplete or illegal */
+}
+
+/*
+ * Compare the UTF-8 strings "s1" (at most "n1" bytes) and "s2" (at most
+ * "n2" bytes), ignoring case.  A NUL byte also ends a string.
+ *
+ * Characters are decoded and compared by their utf_fold() value for as
+ * long as both strings decode cleanly.  From the first illegal or
+ * truncated sequence onwards the comparison is done byte by byte, so that
+ * the result depends on which string is bigger instead of being a fixed
+ * value.
+ *
+ * Returns zero when the strings are equal, a negative value when "s1"
+ * sorts before "s2" and a positive value when it sorts after.  A string
+ * that ends first is the smaller one.
+ */
+    static int
+utf_strnicmp(s1, s2, n1, n2)
+    char_u      *s1, *s2;
+    size_t      n1, n2;
+{
+    int         c1, c2, cdiff;
+    char_u      buffer[6];      /* holds one re-encoded character */
+
+    for (;;)
+    {
+        c1 = utf_safe_read_char_adv(&s1, &n1);
+        c2 = utf_safe_read_char_adv(&s2, &n2);
+
+        /* Stop at the end of either string or at a decoding failure. */
+        if (c1 <= 0 || c2 <= 0)
+            break;
+
+        /* both characters were successfully decoded */
+        cdiff = utf_fold(c1) - utf_fold(c2);
+        if (cdiff != 0)
+            return cdiff;
+    }
+
+    if (c1 == 0 || c2 == 0)
+    {
+        /* One of the strings ended.  The shorter string is always
+         * smaller. */
+        if (c1 == 0 && c2 == 0)
+            return 0;
+        return c1 == 0 ? -1 : 1;
+    }
+
+    /* Here at least one of c1 and c2 is -1 and neither is zero. */
+    if (!(c1 == -1 && c2 == -1))
+    {
+        /* One of the characters is good and the other is incomplete or
+         * contains an illegal byte.  The good one was already consumed,
+         * so fold and re-encode it; it then stands in for the rest of its
+         * string in the bytewise comparison. */
+        if (c1 == -1)
+        {
+            n2 = utf_char2bytes(utf_fold(c2), buffer);
+            s2 = buffer;
+        }
+        else
+        {
+            n1 = utf_char2bytes(utf_fold(c1), buffer);
+            s1 = buffer;
+        }
+    }
+
+    /* do bytewise comparison */
+    while (n1 > 0 && n2 > 0 && *s1 != NUL && *s2 != NUL)
+    {
+        cdiff = (int)(*s1) - (int)(*s2);
+        if (cdiff != 0)
+            return cdiff;
+
+        s1++;
+        s2++;
+        n1--;
+        n2--;
+    }
+
+    /* A NUL byte ends a string just like running out of bytes. */
+    if (n1 > 0 && *s1 == NUL)
+        n1 = 0;
+    if (n2 > 0 && *s2 == NUL)
+        n2 = 0;
+
+    if (n1 == 0 && n2 == 0)
+        return 0;
+    return n1 == 0 ? -1 : 1;
+}
+
/*
* Version of strnicmp() that handles multi-byte characters.
* Needed for Big5, Sjift-JIS and UTF-8 encoding. Other DBCS encodings can
@@ -3097,44 +3234,17 @@ mb_strnicmp(s1, s2, nn)
int incomplete = FALSE;
int n = (int)nn;

- for (i = 0; i < n; i += l)
+ if (enc_utf8)
{
- if (s1[i] == NUL && s2[i] == NUL) /* both strings end */
- return 0;
- if (enc_utf8)
- {
- l = utf_byte2len(s1[i]);
- if (l > n - i)
- {
- l = n - i; /* incomplete character */
- incomplete = TRUE;
- }
- /* Check directly first, it's faster. */
- for (j = 0; j < l; ++j)
- {
- if (s1[i + j] != s2[i + j])
- break;
- if (s1[i + j] == 0)
- /* Both stings have the same bytes but are incomplete or
- * have illegal bytes, accept them as equal. */
- l = j;
- }
- if (j < l)
- {
- /* If one of the two characters is incomplete return -1. */
- if (incomplete || i + utf_byte2len(s2[i]) > n)
- return -1;
- /* Don't case-fold illegal bytes or truncated characters. */
- if (utf_ptr2len(s1 + i) < l || utf_ptr2len(s2 + i) < l)
- return -1;
- cdiff = utf_fold(utf_ptr2char(s1 + i))
- - utf_fold(utf_ptr2char(s2 + i));
- if (cdiff != 0)
- return cdiff;
- }
- }
- else
- {
+ return utf_strnicmp(s1, s2, nn, nn);
+ }
+ else
+ {
+ for (i = 0; i < n; i += l)
+ {
+ if (s1[i] == NUL && s2[i] == NUL) /* both strings end */
+ return 0;
+
l = (*mb_ptr2len)(s1 + i);
if (l <= 1)
{
On Wed, May 25, 2011 at 14:09, Bram Moolenaar <Bram@moolenaar.net> wrote:
> Yes, this code just returns -1, no matter if the first or second string
> is bigger.
>
> Your other remark about difference in byte length of a character is
> right, but it's not so easy to fix.  Can you suggest a patch?
> Preferably with a test.

Hi, here's my patch for mbyte.c and a few testcases.

I've eliminated those return -1's by doing a bytewise comparison of
strings after the first corrupted character. This should make the
comparisons transitive at least.

--
You received this message from the "vim_use" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

No comments:

Post a Comment