Speed up multibyte matching in EGexecute and Fexecute.  Instead of
pre-scanning the whole buffer with check_multibyte_string (), only the
bytes in front of each candidate match are walked with mbrlen ().  The
match_icase copy of the buffer is dropped together with that pre-scan.

Bytes that mbrlen () rejects (invalid, truncated or NUL) are stepped
over as single-byte characters, so every scan advances on each
iteration.  A kwset candidate whose offset falls inside a multibyte
character is a false match: EGexecute skips it only after END has been
computed, and Fexecute retries from the byte after it.

--- grep-2.5.1/src/search.c.egf-speedup	2004-11-03 17:38:36.338557746 +0000
+++ grep-2.5.1/src/search.c	2004-11-03 17:39:51.853925940 +0000
@@ -70,9 +70,6 @@
    call the regexp matcher at all. */
 static int kwset_exact_matches;
 
-#if defined(MBS_SUPPORT)
-static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
-#endif
 static void kwsinit PARAMS ((void));
 static void kwsmusts PARAMS ((void));
 static void Gcompile PARAMS ((char const *, size_t));
@@ -141,47 +138,6 @@
     }
 }
 
-#ifdef MBS_SUPPORT
-/* This function allocate the array which correspond to "buf".
-   Then this check multibyte string and mark on the positions which
-   are not singlebyte character nor the first byte of a multibyte
-   character.  Caller must free the array.  */
-static char*
-check_multibyte_string(char const *buf, size_t size)
-{
-  char *mb_properties = xmalloc(size);
-  mbstate_t cur_state;
-  wchar_t wc;
-  int i;
-  memset(&cur_state, 0, sizeof(mbstate_t));
-  memset(mb_properties, 0, sizeof(char)*size);
-  for (i = 0; i < size ;)
-    {
-      size_t mbclen;
-      mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state);
-
-      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
-	{
-	  /* An invalid sequence, or a truncated multibyte character.
-	     We treat it as a singlebyte character.  */
-	  mbclen = 1;
-	}
-      else if (match_icase)
-	{
-	  if (iswupper((wint_t)wc))
-	    {
-	      wc = towlower((wint_t)wc);
-	      wcrtomb(buf + i, wc, &cur_state);
-	    }
-	}
-      mb_properties[i] = mbclen;
-      i += mbclen;
-    }
-
-  return mb_properties;
-}
-#endif
-
 static void
 Gcompile (char const *pattern, size_t size)
 {
@@ -350,18 +306,9 @@
   struct kwsmatch kwsm;
   size_t i, ret_val;
 #ifdef MBS_SUPPORT
-  char *mb_properties = NULL;
-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        {
-          char *case_buf = xmalloc(size);
-          memcpy(case_buf, buf, size);
-          buf = case_buf;
-        }
-      if (kwset)
-        mb_properties = check_multibyte_string(buf, size);
-    }
+  size_t n;
+  mbstate_t mbs;
+  memset (&mbs, '\0', sizeof (mbstate_t));
 #endif /* MBS_SUPPORT */
 
   buflim = buf + size;
@@ -376,15 +323,56 @@
 	      size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
 	      if (offset == (size_t) -1)
 	        goto failure;
+#ifdef MBS_SUPPORT
+	      n = offset;
+	      while (n)
+		{
+		  size_t mlen = mbrlen (beg, buflim - beg, &mbs);
+		  if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
+		    {
+		      /* Invalid, truncated or NUL: treat it as one byte.  */
+		      memset (&mbs, '\0', sizeof (mbstate_t));
+		      mlen = 1;
+		    }
+		  /* Stop if the match starts inside this character.  */
+		  if (mlen > n)
+		    break;
+
+		  beg += mlen;
+		  n -= mlen;
+		}
+	      /* N is nonzero only for such a false match.  Step onto it
+		 anyway so that END is computed before it is skipped.  */
+	      beg += n;
+
+	      /* Narrow down to the line containing the candidate, and
+		 run it through DFA. */
+	      end = beg;
+	      while (end < buflim)
+		{
+		  size_t mlen = mbrlen (end, buflim - end, &mbs);
+		  if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
+		    {
+		      /* Treat it as one byte so the scan always advances.  */
+		      memset (&mbs, '\0', sizeof (mbstate_t));
+		      mlen = 1;
+		    }
+		  if (mlen == 1 && *end == eol)
+		    break;
+
+		  end += mlen;
+		}
+	      end++;
+	      if (n)
+		continue;
+#else
 	      beg += offset;
 	      /* Narrow down to the line containing the candidate, and
 		 run it through DFA. */
 	      end = memchr(beg, eol, buflim - beg);
 	      end++;
-#ifdef MBS_SUPPORT
-	      if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
-		continue;
-#endif
+#endif /* MBS_SUPPORT */
+	      /* Hmm, is this correct for multibyte? */
 	      while (beg > buf && beg[-1] != eol)
 		--beg;
 	      if (kwsm.index < kwset_exact_matches)
@@ -399,9 +387,49 @@
 	      if (offset == (size_t) -1)
 		break;
 	      /* Narrow down to the line we've found. */
+#ifdef MBS_SUPPORT
+	      n = offset;
+	      while (n)
+		{
+		  size_t mlen = mbrlen (beg, buflim - beg, &mbs);
+		  if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
+		    {
+		      /* Invalid, truncated or NUL: treat it as one byte.  */
+		      memset (&mbs, '\0', sizeof (mbstate_t));
+		      mlen = 1;
+		    }
+		  /* Stop if the match starts inside this character.  */
+		  if (mlen > n)
+		    break;
+
+		  beg += mlen;
+		  n -= mlen;
+		}
+	      /* The DFA result is used as is; N is nonzero only if OFFSET
+		 fell inside a character.  */
+	      beg += n;
+	      end = beg;
+	      while (end < buflim)
+		{
+		  size_t mlen = mbrlen (end, buflim - end, &mbs);
+		  if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
+		    {
+		      /* Treat it as one byte so the scan always advances.  */
+		      memset (&mbs, '\0', sizeof (mbstate_t));
+		      mlen = 1;
+		    }
+		  if (mlen == 1 && *end == eol)
+		    break;
+
+		  end += mlen;
+		}
+	      end++;
+#else
 	      beg += offset;
 	      end = memchr (beg, eol, buflim - beg);
 	      end++;
+#endif /* MBS_SUPPORT */
+	      /* Hmm, is this correct for multibyte? */
 	      while (beg > buf && beg[-1] != eol)
 		--beg;
 	    }
@@ -469,15 +497,6 @@
     } /* for (beg = end ..) */
 
  failure:
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (mb_properties)
-	free (mb_properties);
-      if (match_icase)
-	free ((char *) buf);
-    }
-#endif /* MBS_SUPPORT */
   return (size_t) -1;
 
  success_in_beg_and_end:
@@ -486,15 +505,6 @@
   /* FALLTHROUGH */
 
  success_in_start_and_len:
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (mb_properties)
-	free (mb_properties);
-      if (match_icase)
-	free ((char *) buf);
-    }
-#endif /* MBS_SUPPORT */
   *match_size = len;
   return start;
 }
@@ -531,29 +541,46 @@
   struct kwsmatch kwsmatch;
   size_t ret_val;
 #ifdef MBS_SUPPORT
-  char *mb_properties = NULL;
-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        {
-          char *case_buf = xmalloc(size);
-          memcpy(case_buf, buf, size);
-          buf = case_buf;
-        }
-      mb_properties = check_multibyte_string(buf, size);
-    }
+  mbstate_t mbs;
+  memset (&mbs, '\0', sizeof (mbstate_t));
 #endif /* MBS_SUPPORT */
 
   for (beg = buf; beg <= buf + size; ++beg)
     {
+#ifdef MBS_SUPPORT
+      size_t n;
+#endif /* MBS_SUPPORT */
       size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
       if (offset == (size_t) -1)
 	goto failure;
 #ifdef MBS_SUPPORT
-      if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
-	continue; /* It is a part of multibyte character. */
-#endif /* MBS_SUPPORT */
+      n = offset;
+      while (n)
+	{
+	  size_t mlen = mbrlen (beg, buf + size - beg, &mbs);
+	  if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
+	    {
+	      /* Invalid, truncated or NUL: treat it as one byte.  */
+	      memset (&mbs, '\0', sizeof (mbstate_t));
+	      mlen = 1;
+	    }
+	  /* Stop if the match starts inside this character.  */
+	  if (mlen > n)
+	    break;
+
+	  beg += mlen;
+	  n -= mlen;
+	}
+      if (n)
+	{
+	  /* It is a part of multibyte character: reject the match and
+	     let the loop retry from the byte after it.  */
+	  beg += n;
+	  continue;
+	}
+#else
       beg += offset;
+#endif /* MBS_SUPPORT */
       len = kwsmatch.size[0];
       if (exact && !match_words)
 	goto success_in_beg_and_len;
@@ -587,7 +614,30 @@
 		if (offset == -1) {
 		  break; /* Try a different anchor. */
 		}
+#ifdef MBS_SUPPORT
+		n = offset;
+		while (n)
+		  {
+		    size_t mlen = mbrlen (beg, buf + size - beg, &mbs);
+		    if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
+		      {
+			/* Invalid, truncated or NUL: treat it as one byte.  */
+			memset (&mbs, '\0', sizeof (mbstate_t));
+			mlen = 1;
+		      }
+		    /* Stop if the match starts inside this character.  */
+		    if (mlen > n)
+		      break;
+
+		    beg += mlen;
+		    n -= mlen;
+		  }
+		/* The new anchor is used as is; N is nonzero only if OFFSET
+		   fell inside a character.  */
+		beg += n;
+#else
 		beg += offset;
+#endif /* MBS_SUPPORT */
 		len = kwsmatch.size[0];
 	      }
 	  }
@@ -597,20 +647,31 @@
     }
 
  failure:
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (match_icase)
-        free((char *) buf);
-      if (mb_properties)
-        free(mb_properties);
-    }
-#endif /* MBS_SUPPORT */
   return -1;
 
  success:
+#ifdef MBS_SUPPORT
+  end = beg + len;
+  while (end < buf + size)
+    {
+      size_t mlen = mbrlen (end, buf + size - end, &mbs);
+      if (mlen == (size_t) -1 || mlen == (size_t) -2 || mlen == 0)
+	{
+	  /* Treat it as one byte so the scan always advances.  */
+	  memset (&mbs, '\0', sizeof (mbstate_t));
+	  mlen = 1;
+	}
+      if (mlen == 1 && *end == eol)
+	break;
+
+      end += mlen;
+    }
+  end++;
+#else
   end = memchr (beg + len, eol, (buf + size) - (beg + len));
   end++;
+#endif /* MBS_SUPPORT */
+  /* Hmm, is this correct for multibyte? */
   while (buf < beg && beg[-1] != eol)
     --beg;
   len = end - beg;
@@ -618,15 +679,6 @@
 
  success_in_beg_and_len:
   *match_size = len;
-#ifdef MBS_SUPPORT
-  if (MB_CUR_MAX > 1)
-    {
-      if (mb_properties)
-	free (mb_properties);
-      if (match_icase)
-	free ((char *) buf);
-    }
-#endif /* MBS_SUPPORT */
   return beg - buf;
 }
 