diff --git a/grep-2.6.3-dfa-convert-to-wide-char.patch b/grep-2.6.3-dfa-convert-to-wide-char.patch new file mode 100644 index 0000000..250b6c0 --- /dev/null +++ b/grep-2.6.3-dfa-convert-to-wide-char.patch @@ -0,0 +1,162 @@ +From ff191d4667709b52758fcc5bdc568726d1616be4 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 4 May 2010 17:26:09 +0200 +Subject: [PATCH] dfa: convert to wide character line-by-line +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This provides a nice speedup for -m in general, but especially +it avoids quadratic complexity in case we have to go to glibc. + +Testcases: + + # From upstream backref-multibyte-slow + yes aba | sed 10000q > aba.txt + time ./egrep -c '^([a-z]).\1$' aba.txt + + # From rbiba + time grep '^[a-f][h-j][l-ž]$' cestina-sorted.txt + +* src/dfa.c (prepare_wc_buf): Extract out of dfaexec. Convert +only up to the next newline. +(dfaexec): Exit multibyte processing loop if past buf_end. +Call prepare_wc_buf again after processing a newline. +--- + src/dfa.c | 96 +++++++++++++++++++++++++++++++++++++----------------------- + 1 files changed, 59 insertions(+), 37 deletions(-) + +diff --git a/src/dfa.c b/src/dfa.c +index 523fe05..70aa5a8 100644 +--- a/src/dfa.c ++++ b/src/dfa.c +@@ -2824,6 +2824,53 @@ transit_state (struct dfa *d, int s, unsigned char const **pp) + + #endif /* MBS_SUPPORT */ + ++/* Initialize mblen_buf and inputwcs with data from the next line. */ ++ ++static void ++prepare_wc_buf (const char *begin, const char *end) ++{ ++ unsigned char eol = eolbyte; ++ size_t remain_bytes, i; ++ ++ buf_begin = (unsigned char *) begin; ++ ++ remain_bytes = 0; ++ for (i = 0; i < end - begin + 1; i++) ++ { ++ if (remain_bytes == 0) ++ { ++ remain_bytes ++ = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); ++ if (remain_bytes < 1 ++ || remain_bytes == (size_t) -1 ++ || remain_bytes == (size_t) -2 ++ || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) ++ { ++ remain_bytes = 0; ++ inputwcs[i] = (wchar_t)begin[i]; ++ mblen_buf[i] = 0; ++ if (begin[i] == eol) ++ break; ++ } ++ else ++ { ++ mblen_buf[i] = remain_bytes; ++ remain_bytes--; ++ } ++ } ++ else ++ { ++ mblen_buf[i] = remain_bytes; ++ inputwcs[i] = 0; ++ remain_bytes--; ++ } ++ } ++ ++ buf_end = (unsigned char *) (begin + i); ++ mblen_buf[i] = 0; ++ inputwcs[i] = 0; /* sentinel */ ++} ++ + /* Search through a buffer looking for a match to the given struct dfa. + Find the first occurrence of a string matching the regexp in the + buffer, and the shortest possible version thereof. Return a pointer to +@@ -2870,43 +2917,10 @@ dfaexec (struct dfa *d, char const *begin, char *end, + #ifdef MBS_SUPPORT + if (d->mb_cur_max > 1) + { +- int remain_bytes, i; +- buf_begin = (unsigned char *) begin; +- buf_end = (unsigned char *) end; +- +- /* initialize mblen_buf, and inputwcs. */ + MALLOC(mblen_buf, unsigned char, end - begin + 2); + MALLOC(inputwcs, wchar_t, end - begin + 2); + memset(&mbs, 0, sizeof(mbstate_t)); +- remain_bytes = 0; +- for (i = 0; i < end - begin + 1; i++) +- { +- if (remain_bytes == 0) +- { +- remain_bytes +- = mbrtowc(inputwcs + i, begin + i, end - begin - i + 1, &mbs); +- if (remain_bytes < 1 +- || (remain_bytes == 1 && inputwcs[i] == (wchar_t)begin[i])) +- { +- remain_bytes = 0; +- inputwcs[i] = (wchar_t)begin[i]; +- mblen_buf[i] = 0; +- } +- else +- { +- mblen_buf[i] = remain_bytes; +- remain_bytes--; +- } +- } +- else +- { +- mblen_buf[i] = remain_bytes; +- inputwcs[i] = 0; +- remain_bytes--; +- } +- } +- mblen_buf[i] = 0; +- inputwcs[i] = 0; /* sentinel */ ++ prepare_wc_buf (p, end); + } + #endif /* MBS_SUPPORT */ + +@@ -2916,7 +2930,7 @@ dfaexec (struct dfa *d, char const *begin, char *end, + if (d->mb_cur_max > 1) + while ((t = trans[s])) + { +- if ((char *) p > end) ++ if (p > buf_end) + break; + s1 = s; + SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); +@@ -2985,8 +2999,16 @@ dfaexec (struct dfa *d, char const *begin, char *end, + } + + /* If the previous character was a newline, count it. */ +- if (count && (char *) p <= end && p[-1] == eol) +- ++*count; ++ if ((char *) p <= end && p[-1] == eol) ++ { ++ if (count) ++ ++*count; ++ ++#ifdef MBS_SUPPORT ++ if (d->mb_cur_max > 1) ++ prepare_wc_buf (p, end); ++#endif ++ } + + /* Check if we've run off the end of the buffer. */ + if ((char *) p > end) +-- +1.6.6.1 + diff --git a/grep-2.6.3-dfa-optimize-period.patch b/grep-2.6.3-dfa-optimize-period.patch new file mode 100644 index 0000000..c83cc02 --- /dev/null +++ b/grep-2.6.3-dfa-optimize-period.patch @@ -0,0 +1,165 @@ +From 01422220ebf40f829c1f00418a96873b82f206ff Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Mon, 19 Apr 2010 14:50:23 +0200 +Subject: [PATCH 1/2] dfa: optimize UTF-8 period + +Backport of upstream commits 7a0ad00 and 42ac56a. + +* src/dfa.h (struct dfa): Add utf8_anychar_classes. +* src/dfa.c (add_utf8_anychar): New. +(atom): Simplify if/else nesting. Call add_utf8_anychar for ANYCHAR +in UTF-8 locales. +(dfaoptimize): Abort on ANYCHAR. +--- + src/dfa.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- + src/dfa.h | 1 + + 2 files changed, 82 insertions(+), 14 deletions(-) + +diff --git a/src/dfa.c b/src/dfa.c +index ba78b08..e13c361 100644 +--- a/src/dfa.c ++++ b/src/dfa.c +@@ -1191,6 +1191,55 @@ addtok_wc (wint_t wc) + } + #endif + ++static void ++add_utf8_anychar (void) ++{ ++ static const charclass utf8_classes[5] = { ++ { 0, 0, 0, 0, ~0, ~0, 0, 0 }, /* 80-bf: non-lead bytes */ ++ { ~0, ~0, ~0, ~0, ~0, ~0, 0, 0xff000000 }, /* 00-bf, f8-ff: 1-byte/invalid */ ++ { 0, 0, 0, 0, 0, 0, ~0, 0 }, /* c0-df: 2-byte sequence */ ++ { 0, 0, 0, 0, 0, 0, 0, 0xffff }, /* e0-ef: 3-byte sequence */ ++ { 0, 0, 0, 0, 0, 0, 0, 0xff0000 } /* f0-f7: 4-byte sequence */ ++ }; ++ const unsigned int n = sizeof (utf8_classes) / sizeof (utf8_classes[0]); ++ unsigned int i; ++ ++ /* Define the five character classes that are needed below. */ ++ if (dfa->utf8_anychar_classes[0] == 0) ++ for (i = 0; i < n; i++) ++ { ++ charclass c; ++ memcpy (c, utf8_classes[i], sizeof c); ++ if (i == 1) ++ { ++ if (!(syntax_bits & RE_DOT_NEWLINE)) ++ clrbit (eolbyte, c); ++ if (syntax_bits & RE_DOT_NOT_NULL) ++ clrbit ('\0', c); ++ } ++ dfa->utf8_anychar_classes[i] = CSET + charclass_index(c); ++ } ++ ++ /* A valid UTF-8 character is ++ ++ ([0x00-0x7f] ++ |[0xc2-0xdf][0x80-0xbf] ++ |[0xe0-0xef[0x80-0xbf][0x80-0xbf] ++ |[0xf0-f7][0x80-0xbf][0x80-0xbf][0x80-0xbf]) ++ ++ which I'll write more concisely "B|CA|DAA|EAAA". Factor the [0x80-0xbf] ++ and you get "B|(C|(D|EA)A)A". And since the token buffer is in reverse ++ Polish notation, you get "B C D E A CAT OR A CAT OR A CAT OR". */ ++ for (i = 1; i < n; i++) ++ addtok (dfa->utf8_anychar_classes[i]); ++ while (--i > 1) ++ { ++ addtok (dfa->utf8_anychar_classes[0]); ++ addtok (CAT); ++ addtok (OR); ++ } ++} ++ + /* The grammar understood by the parser is as follows. + + regexp: +@@ -1229,8 +1278,12 @@ addtok_wc (wint_t wc) + static void + atom (void) + { ++ if (0) ++ { ++ /* empty */ ++ } + #ifdef MBS_SUPPORT +- if (tok == WCHAR) ++ else if (tok == WCHAR) + { + addtok_wc (case_fold ? towlower(wctok) : wctok); + #ifndef GREP +@@ -1242,16 +1295,28 @@ atom (void) + #endif + + tok = lex(); +- return; ++ } ++ ++ else if (tok == ANYCHAR && using_utf8()) ++ { ++ /* For UTF-8 expand the period to a series of CSETs that define a valid ++ UTF-8 character. This avoids using the slow multibyte path. I'm ++ pretty sure it would be both profitable and correct to do it for ++ any encoding; however, the optimization must be done manually as ++ it is done above in add_utf8_anychar. So, let's start with ++ UTF-8: it is the most used, and the structure of the encoding ++ makes the correctness more obvious. */ ++ add_utf8_anychar(); ++ tok = lex(); + } + #endif /* MBS_SUPPORT */ + +- if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF +- || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD ++ else if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF ++ || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD + #ifdef MBS_SUPPORT +- || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ ++ || tok == ANYCHAR || tok == MBCSET + #endif /* MBS_SUPPORT */ +- || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) ++ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) + { + addtok(tok); + tok = lex(); +@@ -3027,14 +3092,16 @@ dfaoptimize (struct dfa *d) + for (i = 0; i < d->tindex; ++i) + { + switch(d->tokens[i]) +- { +- case ANYCHAR: +- case MBCSET: +- /* Requires multi-byte algorithm. */ +- return; +- default: +- break; +- } ++ { ++ case ANYCHAR: ++ /* Lowered. */ ++ abort (); ++ case MBCSET: ++ /* Requires multi-byte algorithm. */ ++ return; ++ default: ++ break; ++ } + } + + free_mbdata (d); +diff --git a/src/dfa.h b/src/dfa.h +index 1c85207..42c177a 100644 +--- a/src/dfa.h ++++ b/src/dfa.h +@@ -283,6 +283,7 @@ struct dfa + with dfaparse(). */ + #ifdef MBS_SUPPORT + unsigned int mb_cur_max; /* Cached value of MB_CUR_MAX. */ ++ int utf8_anychar_classes[5]; /* To lower ANYCHAR in UTF-8 locales. */ + + /* The following are used only if MB_CUR_MAX > 1. */ + +-- +1.6.6.1 + diff --git a/grep-2.6.3-dfa-speedup-digit-xdigit.patch b/grep-2.6.3-dfa-speedup-digit-xdigit.patch new file mode 100644 index 0000000..984c3c7 --- /dev/null +++ b/grep-2.6.3-dfa-speedup-digit-xdigit.patch @@ -0,0 +1,114 @@ +From ebca24d6c9eb12f91eed3993de65945ee97dd467 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 4 May 2010 18:07:28 +0200 +Subject: [PATCH] dfa: speed up [[:digit:]] and [[:xdigit:]] + +There's no "multibyte pain" in these two classes, since POSIX +and ISO C99 mandate their contents. + +Time for "./grep -x '[[:digit:]]' /usr/share/dict/linux.words" +Before: 1.5s, after: 0.07s. (sed manages only 0.5s). + +* src/dfa.c (predicates): Declare struct dfa_ctype separately +from definition. Add sb_only. +(find_pred): Return const struct dfa_ctype *. +(parse_bracket_exp): Return const struct dfa_ctype *. Do +not fill MBCSET for sb_only character types. +--- + src/dfa.c | 55 ++++++++++++++++++++++++++++--------------------------- + 1 files changed, 28 insertions(+), 27 deletions(-) + +diff --git a/src/dfa.c b/src/dfa.c +index 4dd26c9..da5a306 100644 +--- a/src/dfa.c ++++ b/src/dfa.c +@@ -429,26 +429,29 @@ typedef int predicate (int); + /* The following list maps the names of the Posix named character classes + to predicate functions that determine whether a given character is in + the class. The leading [ has already been eaten by the lexical analyzer. */ +-static struct { ++struct dfa_ctype { + const char *name; +- predicate *pred; +-} const prednames[] = { +- { "alpha", is_alpha }, +- { "upper", is_upper }, +- { "lower", is_lower }, +- { "digit", is_digit }, +- { "xdigit", is_xdigit }, +- { "space", is_space }, +- { "punct", is_punct }, +- { "alnum", is_alnum }, +- { "print", is_print }, +- { "graph", is_graph }, +- { "cntrl", is_cntrl }, +- { "blank", is_blank }, +- { NULL, NULL } ++ predicate *func; ++ bool sb_only; + }; + +-static predicate * ++static const struct dfa_ctype prednames[] = { ++ { "alpha", isalpha, false }, ++ { "upper", isupper, false }, ++ { "lower", islower, false }, ++ { "digit", isdigit, true }, ++ { "xdigit", isxdigit, true }, ++ { "space", isspace, false }, ++ { "punct", ispunct, false }, ++ { "alnum", isalnum, false }, ++ { "print", isprint, false }, ++ { "graph", isgraph, false }, ++ { "cntrl", iscntrl, false }, ++ { "blank", isblank, false }, ++ { NULL, NULL, false } ++}; ++ ++static const struct dfa_ctype * + find_pred (const char *str) + { + unsigned int i; +@@ -456,7 +459,7 @@ find_pred (const char *str) + if (!strcmp(str, prednames[i].name)) + break; + +- return prednames[i].pred; ++ return &prednames[i]; + } + + /* Multibyte character handling sub-routine for lex. +@@ -553,8 +556,11 @@ parse_bracket_exp (void) + || !strcmp (str, "lower")) + ? "alpha" + : str); ++ const struct dfa_ctype *pred = find_pred (class); ++ if (!pred) ++ dfaerror(_("invalid character class")); + #ifdef MBS_SUPPORT +- if (MB_CUR_MAX > 1) ++ if (MB_CUR_MAX > 1 && !pred->sb_only) + { + /* Store the character class as wctype_t. */ + wctype_t wt = wctype (class); +@@ -568,14 +574,9 @@ parse_bracket_exp (void) + } + #endif + +- { +- predicate *pred = find_pred (class); +- if (!pred) +- dfaerror(_("invalid character class")); +- for (c2 = 0; c2 < NOTCHAR; ++c2) +- if ((*pred)(c2)) +- setbit_case_fold (c2, ccl); +- } ++ for (c2 = 0; c2 < NOTCHAR; ++c2) ++ if (pred->func(c2)) ++ setbit_case_fold (c2, ccl); + } + + #ifdef MBS_SUPPORT +-- +1.6.6.1 + diff --git a/grep-2.6.3-glibc-matcher-fallback.patch b/grep-2.6.3-glibc-matcher-fallback.patch new file mode 100644 index 0000000..609ee20 --- /dev/null +++ b/grep-2.6.3-glibc-matcher-fallback.patch @@ -0,0 +1,38 @@ +From 3fca11d78cfa1fec6199936d57871b9db08226ab Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Thu, 29 Apr 2010 17:13:32 +0200 +Subject: [PATCH 2/2] fall back to glibc matcher if a MBCSET is found + +This patch works around the performance problems of multibyte grep +upstream. + +For UTF-8 it should trigger only in the presence of MBCSET, e.g. [a-z]. + +For other character sets all brackets and `.` as well will trigger it. +--- + src/dfa.c | 9 +++++++++ + 1 files changed, 9 insertions(+), 0 deletions(-) + +diff --git a/src/dfa.c b/src/dfa.c +index e13c361..523fe05 100644 +--- a/src/dfa.c ++++ b/src/dfa.c +@@ -2927,6 +2927,15 @@ dfaexec (struct dfa *d, char const *begin, char *end, + continue; + } + ++ if (backref) ++ { ++ *backref = 1; ++ free(mblen_buf); ++ free(inputwcs); ++ *end = saved_end; ++ return (char *) p; ++ } ++ + /* Can match with a multibyte character (and multi character + collating element). Transition table might be updated. */ + s = transit_state(d, s, &p); +-- +1.6.6.1 + diff --git a/grep-2.6.3-mmap-option-fix.patch b/grep-2.6.3-mmap-option-fix.patch new file mode 100644 index 0000000..6504dff --- /dev/null +++ b/grep-2.6.3-mmap-option-fix.patch @@ -0,0 +1,26 @@ +From dfa2891e473abdb5507fff65002f946b60145f44 Mon Sep 17 00:00:00 2001 +From: Paolo Bonzini +Date: Tue, 20 Apr 2010 12:32:22 +0200 +Subject: [PATCH] grep: fix --mmap not being ignored + +* NEWS: Document bugfix. +* main.c (main): Ignore MMAP_OPTION. +--- + src/main.c | 1 + + 1 files changed, 1 insertions(+), 0 deletions(-) + +diff --git a/src/main.c b/src/main.c +index 1697c80..0ccf6eb 100644 +--- a/src/main.c ++++ b/src/main.c +@@ -2061,6 +2061,7 @@ main (int argc, char **argv) + label = optarg; + break; + ++ case MMAP_OPTION: + case 0: + /* long options */ + break; +-- +1.6.6.1 + diff --git a/grep.spec b/grep.spec index 3f038bd..35ed50f 100644 --- a/grep.spec +++ b/grep.spec @@ -3,7 +3,7 @@ Summary: Pattern matching utilities Name: grep Version: 2.6.3 -Release: 1%{?dist} +Release: 2%{?dist} License: GPLv3+ Group: Applications/Text Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.xz @@ -13,6 +13,21 @@ Requires(preun): /sbin/install-info BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) BuildRequires: pcre-devel >= 3.9-10, texinfo, gettext BuildRequires: autoconf automake +# Speedups DFA UTF-8 period patterns. +# original name: 0001-dfa-optimize-UTF-8-period.patch +Patch0: grep-2.6.3-dfa-optimize-period.patch +# Speedups MBCSET ([a-z]) UTF-8 patterns. +# original name: 0002-fall-back-to-glibc-matcher-if-a-MBCSET-is-found.patch +Patch1: grep-2.6.3-glibc-matcher-fallback.patch +# Deprecated --mmap option is now properly ignored. +# 0003-grep-fix-mmap-not-being-ignored.patch +Patch2: grep-2.6.3-mmap-option-fix.patch +# Speedups -m and removes quadratic complexity when going to glibc. +# 0004-dfa-convert-to-wide-character-line-by-line.patch +Patch3: grep-2.6.3-dfa-convert-to-wide-char.patch +# Speedups DFA [[:digit:]] and [[:xdigit:]] patterns. +# 0005-dfa-speed-up-digit-and-xdigit.patch +Patch4: grep-2.6.3-dfa-speedup-digit-xdigit.patch %description The GNU versions of commonly used grep utilities. Grep searches through @@ -23,6 +38,11 @@ GNU grep is needed by many scripts, so it shall be installed on every system. %prep %setup -q +%patch0 -p1 -b .dfa-optimize-period +%patch1 -p1 -b .glibc-matcher-fallback +%patch2 -p1 -b .mmap-option-fix +%patch3 -p1 -b .dfa-convert-to-wide-char +%patch4 -p1 -b .speedup-digit-xdigit %build %configure --without-included-regex CPPFLAGS="-I%{_includedir}/pcre" @@ -59,6 +79,14 @@ fi %{_mandir}/*/* %changelog +* Tue May 06 2010 Jaroslav Škarvada - 2.6.3-2 +- Added dfa-optimize-period patch (speedup for . patterns in UTF-8) +- Added glibc-matcher-fallback patch (speedup for [a-z] patterns in UTF-8) +- Added mmap-option-fix patch +- Added dfa-convert-to-wide-char patch (speedup for -m and remove quadratic + complexity when going to glibc) +- Added dfa-speedup-digit-xdigit patch (speedup for [[:digit:]] [:xdigit:]]) + * Sun Apr 04 2010 Jaroslav Škarvada - 2.6.3-1 - New version: grep-2.6.3 - make check is not silent now @@ -77,7 +105,7 @@ fi - Added w patch to fix -w switch behaviour broken by dfa-optional patch * Wed Feb 10 2010 Jaroslav Škarvada - 2.5.4-1 -- New version: grep-2.5.4 (#502931) +- New version: grep-2.5.4 - Fixed typos in %description - Updated utf-8 patch - Added dfa-optional patch (#538423)