From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 4 May 2010 17:26:09 +0200 Subject: [PATCH] dfa: convert to wide character line-by-line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This provides a nice speedup for -m in general, but especially it avoids quadratic complexity in case we have to go to glibc. Testcases: # From upstream backref-multibyte-slow yes aba | sed 10000q > aba.txt time ./egrep -c '^([a-z]).\1$' aba.txt # From rbiba time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt * src/dfa.c (prepare_wc_buf): Extract out of dfaexec. Convert only up to the next newline. (dfaexec): Exit multibyte processing loop if past buf_end. Call prepare_wc_buf again after processing a newline. --- src/dfa.c | 96 +++++++++++++++++++++++++++++++++++++----------------------- 1 files changed, 59 insertions(+), 37 deletions(-) diff --git a/src/dfa.c b/src/dfa.c index 523fe05..70aa5a8 100644 --- a/src/dfa.c +++ b/src/dfa.c @@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp) #endif /* MBS_SUPPORT */ +/* Initialize mblen_buf and inputwcs with data from the next line. 
*/ + +static void +prepare_wc_buf (const char *begin, const char *end) +{ + unsigned char eol = eolbyte; + size_t remain_bytes, i; + + buf_begin = (unsigned char *) begin; + + remain_bytes = 0; + for (i = 0; i < end - begin + 1; i++) + { + if (remain_bytes == 0) + { + remain_bytes + = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); + if (remain_bytes < 1 + || remain_bytes == (size_t) -1 + || remain_bytes == (size_t) -2 + || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) + { + remain_bytes = 0; + inputwcs[i] = (wchar_t)begin[i]; + mblen_buf[i] = 0; + if (begin[i] == eol) + break; + } + else + { + mblen_buf[i] = remain_bytes; + remain_bytes--; + } + } + else + { + mblen_buf[i] = remain_bytes; + inputwcs[i] = 0; + remain_bytes--; + } + } + + buf_end = (unsigned char *) (begin + i); + mblen_buf[i] = 0; + inputwcs[i] = 0; /* sentinel */ +} + /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, and the shortest possible version thereof. Return a pointer to @@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end, #ifdef MBS_SUPPORT if (d->mb_cur_max > 1) { - int remain_bytes, i; - buf_begin = (unsigned char *) begin; - buf_end = (unsigned char *) end; - - /* initialize mblen_buf, and inputwcs. 
*/ MALLOC(mblen_buf, unsigned char, end - begin + 2); MALLOC(inputwcs, wchar_t, end - begin + 2); memset(&mbs, 0, sizeof(mbstate_t)); - remain_bytes = 0; - for (i = 0; i < end - begin + 1; i++) - { - if (remain_bytes == 0) - { - remain_bytes - = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); - if (remain_bytes < 1 - || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) - { - remain_bytes = 0; - inputwcs[i] = (wchar_t)begin[i]; - mblen_buf[i] = 0; - } - else - { - mblen_buf[i] = remain_bytes; - remain_bytes--; - } - } - else - { - mblen_buf[i] = remain_bytes; - inputwcs[i] = 0; - remain_bytes--; - } - } - mblen_buf[i] = 0; - inputwcs[i] = 0; /* sentinel */ + prepare_wc_buf (p, end); } #endif /* MBS_SUPPORT */ @@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, if (d->mb_cur_max > 1) while ((t = trans[s])) { - if ((char *) p > end) + if (p > buf_end) break; s1 = s; SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); @@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end, } /* If the previous character was a newline, count it. */ - if (count && (char *) p <= end && p[-1] == eol) - ++*count; + if ((char *) p <= end && p[-1] == eol) + { + if (count) + ++*count; + +#ifdef MBS_SUPPORT + if (d->mb_cur_max > 1) + prepare_wc_buf (p, end); +#endif + } /* Check if we've run off the end of the buffer. */ if ((char *) p > end) -- 1.6.6.1