diff --git a/grep-2.5.1-egf-speedup.patch b/grep-2.5.1-egf-speedup.patch index 749ca7d..ab840af 100644 --- a/grep-2.5.1-egf-speedup.patch +++ b/grep-2.5.1-egf-speedup.patch @@ -1,16 +1,46 @@ ---- grep-2.5.1/src/search.c.egf-speedup 2004-11-05 12:50:25.934736684 +0000 -+++ grep-2.5.1/src/search.c 2004-11-05 13:52:33.819394140 +0000 -@@ -70,9 +70,6 @@ +--- grep-2.5.1/src/search.c 2004-12-14 15:08:58.985159277 +0000 ++++ grep-2.5.1/src/search.c 2004-12-14 15:55:21.257729918 +0000 +@@ -39,6 +39,9 @@ + #ifdef HAVE_LIBPCRE + # include <pcre.h> + #endif ++#ifdef HAVE_LANGINFO_CODESET ++# include <langinfo.h> ++#endif + + #define NCHAR (UCHAR_MAX + 1) + +@@ -70,9 +73,10 @@ call the regexp matcher at all. */ static int kwset_exact_matches; -#if defined(MBS_SUPPORT) -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); -#endif ++/* UTF-8 encoding allows some optimizations that we can't otherwise ++ assume in a multibyte encoding. */ ++static int using_utf8; ++ static void kwsinit PARAMS ((void)); static void kwsmusts PARAMS ((void)); static void Gcompile PARAMS ((char const *, size_t)); -@@ -141,47 +138,6 @@ +@@ -84,6 +88,15 @@ + static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); + + void ++check_utf8 (void) ++{ ++#ifdef HAVE_LANGINFO_CODESET ++ if (strcmp (nl_langinfo (CODESET), "UTF-8") == 0) ++ using_utf8 = 1; ++#endif ++} ++ ++void + dfaerror (char const *mesg) + { + error (2, 0, mesg); +@@ -141,47 +154,6 @@ } } @@ -58,7 +88,23 @@ static void Gcompile (char const *pattern, size_t size) { -@@ -350,18 +306,8 @@ +@@ -190,6 +162,7 @@ + size_t total = size; + char const *motif = pattern; + ++ check_utf8 (); + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE | (match_icase ? 
RE_ICASE : 0)); + dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); + +@@ -266,6 +239,7 @@ + size_t total = size; + char const *motif = pattern; + ++ check_utf8 (); + if (strcmp (matcher, "awk") == 0) + { + re_set_syntax (RE_SYNTAX_AWK | (match_icase ? RE_ICASE : 0)); +@@ -350,18 +324,8 @@ struct kwsmatch kwsm; size_t i, ret_val; #ifdef MBS_SUPPORT @@ -79,7 +125,7 @@ #endif /* MBS_SUPPORT */ buflim = buf + size; -@@ -373,18 +319,48 @@ +@@ -373,18 +337,48 @@ if (kwset) { /* Find a possible match using the KWset matcher. */ @@ -90,7 +136,7 @@ if (offset == (size_t) -1) goto failure; +#ifdef MBS_SUPPORT -+ if (MB_CUR_MAX > 1) ++ if (MB_CUR_MAX > 1 && !using_utf8) + { + bytes_left = offset; + while (bytes_left) @@ -130,7 +176,7 @@ while (beg > buf && beg[-1] != eol) --beg; if (kwsm.index < kwset_exact_matches) -@@ -395,13 +371,47 @@ +@@ -395,13 +389,47 @@ else { /* No good fixed strings; start with DFA. */ @@ -142,7 +188,7 @@ break; /* Narrow down to the line we've found. */ +#ifdef MBS_SUPPORT -+ if (MB_CUR_MAX > 1) ++ if (MB_CUR_MAX > 1 && !using_utf8) + { + bytes_left = offset; + while (bytes_left) @@ -178,7 +224,7 @@ while (beg > buf && beg[-1] != eol) --beg; } -@@ -469,15 +479,6 @@ +@@ -469,15 +497,6 @@ } /* for (beg = end ..) 
*/ failure: @@ -194,7 +240,7 @@ return (size_t) -1; success_in_beg_and_end: -@@ -486,15 +487,6 @@ +@@ -486,15 +505,6 @@ /* FALLTHROUGH */ success_in_start_and_len: @@ -210,7 +256,15 @@ *match_size = len; return start; } -@@ -531,17 +523,8 @@ +@@ -504,6 +514,7 @@ + { + char const *beg, *lim, *err; + ++ check_utf8 (); + kwsinit (); + beg = pattern; + do +@@ -531,17 +542,8 @@ struct kwsmatch kwsmatch; size_t ret_val; #ifdef MBS_SUPPORT @@ -230,13 +284,13 @@ #endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) -@@ -550,8 +533,33 @@ +@@ -550,8 +552,33 @@ if (offset == (size_t) -1) goto failure; #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) - continue; /* It is a part of multibyte character. */ -+ if (MB_CUR_MAX > 1) ++ if (MB_CUR_MAX > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) @@ -266,12 +320,12 @@ #endif /* MBS_SUPPORT */ beg += offset; len = kwsmatch.size[0]; -@@ -587,6 +595,36 @@ +@@ -587,6 +614,36 @@ if (offset == -1) { break; /* Try a different anchor. 
*/ } +#ifdef MBS_SUPPORT -+ if (MB_CUR_MAX > 1) ++ if (MB_CUR_MAX > 1 && !using_utf8) + { + size_t bytes_left = offset; + while (bytes_left) @@ -303,47 +357,47 @@ beg += offset; len = kwsmatch.size[0]; } -@@ -597,20 +635,30 @@ +@@ -597,19 +654,31 @@ } failure: --#ifdef MBS_SUPPORT ++ return -1; ++ ++ success: + #ifdef MBS_SUPPORT - if (MB_CUR_MAX > 1) -- { ++ if (MB_CUR_MAX > 1 && !using_utf8) + { - if (match_icase) - free((char *) buf); - if (mb_properties) - free(mb_properties); -- } --#endif /* MBS_SUPPORT */ - return -1; - - success: -+#ifdef MBS_SUPPORT -+ end = beg + len; -+ while (end < buf + size) -+ { -+ size_t len = mbrlen (end, buf + size - end, &mbs); -+ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ end = beg + len; ++ while (end < buf + size) + { -+ memset (&mbs, '\0', sizeof (mbstate_t)); -+ len = 1; -+ } -+ if (len == 1 && *end == eol) -+ break; ++ size_t len = mbrlen (end, buf + size - end, &mbs); ++ if (len == (size_t) -1 || len == (size_t) -2 || len == 0) ++ { ++ memset (&mbs, '\0', sizeof (mbstate_t)); ++ len = 1; ++ } ++ if (len == 1 && *end == eol) ++ break; + -+ end += len; -+ } -+ end++; -+#else ++ end += len; ++ } + } ++ else + #endif /* MBS_SUPPORT */ +- return -1; +- +- success: end = memchr (beg + len, eol, (buf + size) - (beg + len)); ++ end++; -+#endif /* MBS_SUPPORT */ -+ /* Hmm, is this correct for multibyte? */ while (buf < beg && beg[-1] != eol) --beg; - len = end - beg; -@@ -618,15 +666,6 @@ +@@ -618,15 +687,6 @@ success_in_beg_and_len: *match_size = len; diff --git a/grep.spec b/grep.spec index 98dfeab..42f9ca9 100644 --- a/grep.spec +++ b/grep.spec @@ -1,7 +1,7 @@ Summary: The GNU versions of grep pattern matching utilities. 
Name: grep Version: 2.5.1 -Release: 41 +Release: 42 License: GPL Group: Applications/Text Source: ftp://ftp.gnu.org/pub/gnu/grep/grep-%{version}.tar.bz2 @@ -85,6 +85,10 @@ fi %{_mandir}/*/* %changelog +* Tue Dec 14 2004 Tim Waugh 2.5.1-42 +- Further UTF-8 processing avoided since a '\n' byte is always an + end-of-line character in that encoding. + * Fri Dec 3 2004 Tim Waugh 2.5.1-41 - Fixed a busy loop in the egf-speedup patch (bug #140781).