• Skip to content
  • Skip to link menu
Trinity API Reference
  • Trinity API Reference
  • kjs
 

kjs

  • kjs
regexp.cpp
1/*
2 * This file is part of the KDE libraries
3 * Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
4 * Copyright (C) 2003,2004 Apple Computer, Inc.
5 * Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 */
22
23#include "regexp.h"
24
25#include "lexer.h"
26#include <assert.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30
31using namespace KJS;
32
33#ifdef PCRE_CONFIG_UTF8
// Shared by all RegExp instances. Starts out as Unknown; the RegExp
// constructor replaces it with Supported or Unsupported after asking
// pcre_config() the first time it runs.
34RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
35#endif
36
37RegExp::RegExp(const UString &p, int f)
38 : pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
39{
40 // Determine whether libpcre has unicode support if need be..
41#ifdef PCRE_CONFIG_UTF8
42 if (utf8Support == Unknown) {
43 int supported;
44 pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
45 utf8Support = supported ? Supported : Unsupported;
46 }
47#endif
48
49 nrSubPatterns = 0; // determined in match() with POSIX regex.
50
51 // JS regexps can contain Unicode escape sequences (\uxxxx) which
52 // are rather uncommon elsewhere. As our regexp libs don't understand
53 // them we do the unescaping ourselves internally.
54 // Also make sure to expand out any nulls as pcre_compile
55 // expects null termination..
56 UString intern;
57 const char* const nil = "\\x00";
58 if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
59 bool escape = false;
60 for (int i = 0; i < p.size(); ++i) {
61 UChar c = p[i];
62 if (escape) {
63 escape = false;
64 // we only care about \u
65 if (c == 'u') {
66 // standard unicode escape sequence looks like \uxxxx but
67 // other browsers also accept less then 4 hex digits
68 unsigned short u = 0;
69 int j = 0;
70 for (j = 0; j < 4; ++j) {
71 if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
72 u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
73 ++i;
74 } else {
75 // sequence incomplete. restore index.
76 // TODO: cleaner way to propagate warning
77 fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
78 i -= j;
79 break;
80 }
81 }
82 if (j < 4) {
83 // sequence was incomplete. treat \u as u which IE always
84 // and FF sometimes does.
85 intern.append(UString('u'));
86 } else {
87 c = UChar(u);
88 switch (u) {
89 case 0:
90 // Make sure to encode 0, to avoid terminating the string
91 intern += UString(nil);
92 break;
93 case '^':
94 case '$':
95 case '\\':
96 case '.':
97 case '*':
98 case '+':
99 case '?':
100 case '(': case ')':
101 case '{': case '}':
102 case '[': case ']':
103 case '|':
104 // escape pattern characters have to remain escaped
105 intern.append(UString('\\'));
106 // intentional fallthrough
107 default:
108 intern += UString(&c, 1);
109 break;
110 }
111 }
112 continue;
113 }
114 intern += UString('\\');
115 intern += UString(&c, 1);
116 } else {
117 if (c == '\\')
118 escape = true;
119 else if (c == '\0')
120 intern += UString(nil);
121 else
122 intern += UString(&c, 1);
123 }
124 }
125 } else {
126 intern = p;
127 }
128
129#ifdef HAVE_PCREPOSIX
130 int pcreflags = 0;
131 const char *perrormsg;
132 int errorOffset;
133
134 if (flgs & IgnoreCase)
135 pcreflags |= PCRE_CASELESS;
136
137 if (flgs & Multiline)
138 pcreflags |= PCRE_MULTILINE;
139
140#ifdef PCRE_CONFIG_UTF8
141 if (utf8Support == Supported)
142 pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
143#endif
144
145 // Fill our buffer with an encoded version, whether utf-8, or,
146 // if PCRE is incapable, truncated.
147 prepareMatch(intern);
148
149 pcregex = pcre_compile(buffer, pcreflags,
150 &perrormsg, &errorOffset, NULL);
151 doneMatch(); // Cleanup buffers
152 if (!pcregex) {
153#ifndef NDEBUG
154 fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
155#endif
156 valid = false;
157 return;
158 }
159
160#ifdef PCRE_INFO_CAPTURECOUNT
161 // Get number of subpatterns that will be returned
162 int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
163 if (rc != 0)
164#endif
165 nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
166
167#else /* HAVE_PCREPOSIX */
168
169 int regflags = 0;
170#ifdef REG_EXTENDED
171 regflags |= REG_EXTENDED;
172#endif
173#ifdef REG_ICASE
174 if ( f & IgnoreCase )
175 regflags |= REG_ICASE;
176#endif
177
178 //NOTE: Multiline is not feasible with POSIX regex.
179 //if ( f & Multiline )
180 // ;
181 // Note: the Global flag is already handled by RegExpProtoFunc::execute
182
183 int errorCode = regcomp(&preg, intern.ascii(), regflags);
184 if (errorCode != 0) {
185#ifndef NDEBUG
186 char errorMessage[80];
187 regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
188 fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
189#endif
190 valid = false;
191 }
192#endif
193}
194
195RegExp::~RegExp()
196{
197 doneMatch(); // Be 100% sure buffers are freed
198#ifdef HAVE_PCREPOSIX
199 if (pcregex)
200 pcre_free(pcregex);
201#else
202 /* TODO: is this really okay after an error ? */
203 regfree(&preg);
204#endif
205}
206
207void RegExp::prepareUtf8(const UString& s)
208{
209 // Allocate a buffer big enough to hold all the characters plus \0
210 const int length = s.size();
211 buffer = new char[length * 3 + 1];
212
213 // Also create buffer for positions. We need one extra character in there,
214 // even past the \0 since the non-empty handling may jump one past the end
215 originalPos = new int[length * 3 + 2];
216
217 // Convert to runs of 8-bit characters, and generate indeces
218 // Note that we do NOT combine surrogate pairs here, as
219 // regexps operate on them as separate characters
220 char *p = buffer;
221 int *posOut = originalPos;
222 const UChar *d = s.data();
223 for (int i = 0; i != length; ++i) {
224 unsigned short c = d[i].unicode();
225
226 int sequenceLen;
227 if (c < 0x80) {
228 *p++ = (char)c;
229 sequenceLen = 1;
230 } else if (c < 0x800) {
231 *p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
232 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
233 sequenceLen = 2;
234 } else {
235 *p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
236 *p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
237 *p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
238 sequenceLen = 3;
239 }
240
241 while (sequenceLen > 0) {
242 *posOut = i;
243 ++posOut;
244 --sequenceLen;
245 }
246 }
247
248 bufferSize = p - buffer;
249
250 *p++ = '\0';
251
252 // Record positions for \0, and the fictional character after that.
253 *posOut = length;
254 *(posOut+1) = length+1;
255}
256
257void RegExp::prepareASCII (const UString& s)
258{
259 originalPos = 0;
260
261 // Best-effort attempt to get something done
262 // when we don't have utf 8 available -- use
263 // truncated version, and pray for the best
264 CString truncated = s.cstring();
265 buffer = new char[truncated.size() + 1];
266 memcpy(buffer, truncated.c_str(), truncated.size());
267 buffer[truncated.size()] = '\0'; // For _compile use
268 bufferSize = truncated.size();
269}
270
271void RegExp::prepareMatch(const UString &s)
272{
273 delete[] originalPos; // Just to be sure..
274 delete[] buffer;
275#ifdef PCRE_CONFIG_UTF8
276 if (utf8Support == Supported)
277 prepareUtf8(s);
278 else
279#endif
280 prepareASCII(s);
281
282#ifndef NDEBUG
283 originalS = s;
284#endif
285}
286
287void RegExp::doneMatch()
288{
289 delete[] originalPos; originalPos = 0;
290 delete[] buffer; buffer = 0;
291}
292
293UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
294{
295#ifndef NDEBUG
296 assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
297#endif
298 assert(valid);
299
300 if (i < 0)
301 i = 0;
302 if (ovector)
303 *ovector = 0L;
304 int dummyPos;
305 if (!pos)
306 pos = &dummyPos;
307 *pos = -1;
308 if (i > s.size() || s.isNull())
309 return UString::null;
310
311#ifdef HAVE_PCREPOSIX
312 int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
313 if (ovector) *ovector = new int[ovecsize];
314 if (!pcregex)
315 return UString::null;
316
317 int startPos;
318 int nextPos;
319
320#ifdef PCRE_CONFIG_UTF8
321 if (utf8Support == Supported) {
322 startPos = i;
323 while (originalPos[startPos] < i)
324 ++startPos;
325
326 nextPos = startPos;
327 if (i < s.size()) {
328 while (originalPos[nextPos] < (i + 1))
329 ++nextPos;
330 }
331 } else
332#endif
333 {
334 startPos = i;
335 nextPos = i + (i < s.size() ? 1 : 0);
336 }
337
338 int baseFlags =
339#ifdef PCRE_CONFIG_UTF8
340 utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
341#endif
342 0;
343 int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
344 m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest
345 ovector ? *ovector : 0L, ovecsize);
346 if (numMatches < 0)
347 {
348 // Failed to match.
349 if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos)
350 {
351 // We set m_notEmpty ourselves, to look for a non-empty match
352 // (see man pcretest or pcretest.c for details).
353 // So we don't stop here, we want to try again at i+1.
354#ifdef KJS_VERBOSE
355 fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
356#endif
357 m_notEmpty = 0;
358 numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
359 ovector ? *ovector : 0L, ovecsize);
360 if (numMatches < 0)
361 return UString::null;
362 }
363 else // done
364 return UString::null;
365 }
366
367 // Got a match, proceed with it.
368 // But fix up the ovector if need be..
369 if (ovector && originalPos) {
370 for (unsigned c = 0; c < 2 * TQMIN((unsigned)numMatches, nrSubPatterns+1); ++c) {
371 if ((*ovector)[c] != -1)
372 (*ovector)[c] = originalPos[(*ovector)[c]];
373 }
374 }
375
376 if (!ovector)
377 return UString::null; // don't rely on the return value if you pass ovector==0
378#else
379 const uint maxMatch = 10;
380 regmatch_t rmatch[maxMatch];
381
382 char *str = strdup(s.ascii()); // TODO: why ???
383 if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
384 free(str);
385 return UString::null;
386 }
387 free(str);
388
389 if (!ovector) {
390 *pos = rmatch[0].rm_so + i;
391 return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
392 }
393
394 // map rmatch array to ovector used in PCRE case
395 nrSubPatterns = 0;
396 for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
397 nrSubPatterns++;
398 // if the nonEmpty flag is set, return a failed match if any of the
399 // subMatches happens to be an empty string.
400 if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
401 return UString::null;
402 }
403 // Allow an ovector slot to return the (failed) match result.
404 if (nrSubPatterns == 0) nrSubPatterns = 1;
405
406 int ovecsize = (nrSubPatterns)*3; // see above
407 *ovector = new int[ovecsize];
408 for (uint j = 0; j < nrSubPatterns; j++) {
409 (*ovector)[2*j] = rmatch[j].rm_so + i;
410 (*ovector)[2*j+1] = rmatch[j].rm_eo + i;
411 }
412#endif
413
414 *pos = (*ovector)[0];
415 if ( *pos == (*ovector)[1] && (flgs & Global) )
416 {
417 // empty match, next try will be with m_notEmpty=true
418 m_notEmpty=true;
419 }
420 return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
421}
422
#if 0 // unused
/**
 * Reports whether the expression matches anywhere in @p s.
 * The second parameter is ignored.
 */
bool RegExp::test(const UString &s, int)
{
#ifdef HAVE_PCREPOSIX
  int offsets[300];
  CString subject(s.cstring());

  if (s.isNull())
    return false;

  // Every result other than "no match" counts as a match.
  const int rc = pcre_exec(pcregex, NULL, subject.c_str(), subject.size(), 0,
                           0, offsets, 300);
  return rc != PCRE_ERROR_NOMATCH;

#else

  char *subject = strdup(s.ascii());
  const int rc = regexec(&preg, subject, 0, 0, 0);
  free(subject);

  return rc == 0;
#endif
}
#endif
KJS::CString
8 bit char based string class
Definition: ustring.h:165
KJS::UString
Unicode string class.
Definition: ustring.h:189
KJS::UString::find
int find(const UString &f, int pos=0) const
Definition: ustring.cpp:798
KJS::UString::ascii
char * ascii() const
Convert the Unicode string to plain ASCII chars, chopping off any higher bytes.
Definition: ustring.cpp:485
KJS::UString::isNull
bool isNull() const
Definition: ustring.h:343
KJS::UString::size
int size() const
Definition: ustring.h:359
KJS::UString::substr
UString substr(int pos=0, int len=-1) const
Definition: ustring.cpp:868
KJS::UString::cstring
CString cstring() const
Definition: ustring.cpp:480
KJS::UString::data
const UChar * data() const
Definition: ustring.h:339
KJS::UString::append
UString & append(const UString &)
Append another string.
Definition: ustring.cpp:457
KJS::UChar
Unicode character.
Definition: ustring.h:51
KJS::UChar::unicode
unsigned short unicode() const
Definition: ustring.h:81

kjs

Skip menu "kjs"
  • Main Page
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Class Members
  • Related Pages

kjs

Skip menu "kjs"
  • arts
  • dcop
  • dnssd
  • interfaces
  •   kspeech
  •     interface
  •     library
  •   tdetexteditor
  • kate
  • kded
  • kdoctools
  • kimgio
  • kjs
  • libtdemid
  • libtdescreensaver
  • tdeabc
  • tdecmshell
  • tdecore
  • tdefx
  • tdehtml
  • tdeinit
  • tdeio
  •   bookmarks
  •   httpfilter
  •   kpasswdserver
  •   kssl
  •   tdefile
  •   tdeio
  •   tdeioexec
  • tdeioslave
  •   http
  • tdemdi
  •   tdemdi
  • tdenewstuff
  • tdeparts
  • tdeprint
  • tderandr
  • tderesources
  • tdespell2
  • tdesu
  • tdeui
  • tdeunittest
  • tdeutils
  • tdewallet
Generated for kjs by doxygen 1.9.4
This website is maintained by Timothy Pearson.