mirror of
https://github.com/SickGear/SickGear.git
synced 2025-03-01 18:35:02 +00:00
Shows now display Indexer absolute numbering. Improved speed of parsing search results. Fixed episode naming issues.
22557 lines
678 KiB
C
22557 lines
678 KiB
C
/* Secret Labs' Regular Expression Engine
|
|
*
|
|
* regular expression matching engine
|
|
*
|
|
* partial history:
|
|
* 1999-10-24 fl created (based on existing template matcher code)
|
|
* 2000-03-06 fl first alpha, sort of
|
|
* 2000-08-01 fl fixes for 1.6b1
|
|
* 2000-08-07 fl use PyOS_CheckStack() if available
|
|
* 2000-09-20 fl added expand method
|
|
* 2001-03-20 fl lots of fixes for 2.1b2
|
|
* 2001-04-15 fl export copyright as Python attribute, not global
|
|
* 2001-04-28 fl added __copy__ methods (work in progress)
|
|
* 2001-05-14 fl fixes for 1.5.2 compatibility
|
|
* 2001-07-01 fl added BIGCHARSET support (from Martin von Loewis)
|
|
* 2001-10-18 fl fixed group reset issue (from Matthew Mueller)
|
|
* 2001-10-20 fl added split primitive; reenable unicode for 1.6/2.0/2.1
|
|
* 2001-10-21 fl added sub/subn primitive
|
|
* 2001-10-24 fl added finditer primitive (for 2.2 only)
|
|
* 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
|
|
* 2002-11-09 fl fixed empty sub/subn return type
|
|
* 2003-04-18 mvl fully support 4-byte codes
|
|
* 2003-10-17 gn implemented non recursive scheme
|
|
* 2009-07-26 mrab completely re-designed matcher code
|
|
* 2011-11-18 mrab added support for PEP 393 strings
|
|
*
|
|
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
|
|
*
|
|
* This version of the SRE library can be redistributed under CNRI's
|
|
* Python 1.6 license. For any other use, please contact Secret Labs
|
|
* AB (info@pythonware.com).
|
|
*
|
|
* Portions of this engine have been developed in cooperation with
|
|
* CNRI. Hewlett-Packard provided funding for 1.6 integration and
|
|
* other compatibility work.
|
|
*/
|
|
|
|
/* Uncomment the following line to make TRACE() produce output. */
/* #define VERBOSE */

/* TRACE(X) takes a parenthesised printf argument list, e.g.
 * TRACE(("%d\n", n)). When VERBOSE is defined the expansion is a printf call
 * that already carries its own terminating semicolon; otherwise it expands to
 * nothing and the argument is never evaluated.
 */
#ifdef VERBOSE
#define TRACE(X) printf X;
#else
#define TRACE(X)
#endif
|
|
|
|
#include "Python.h"
|
|
#include "structmember.h" /* offsetof */
|
|
#include <ctype.h>
|
|
#include "_regex.h"
|
|
#include "pyport.h"
|
|
#include "pythread.h"
|
|
|
|
/* Fallback definition of T_PYSSIZET, used only when PY_VERSION_HEX is below
 * 0x02060000: pick the structmember type code whose size matches
 * SIZEOF_SIZE_T. T_LONGLONG is preferred when both sizes match.
 */
#if PY_VERSION_HEX < 0x02060000
#  if SIZEOF_SIZE_T == SIZEOF_LONG_LONG
#    define T_PYSSIZET T_LONGLONG
#  elif SIZEOF_SIZE_T == SIZEOF_LONG
#    define T_PYSSIZET T_LONG
#  else
#    error size_t is the same size as neither LONG nor LONGLONG
#  endif

#endif
|
|
typedef unsigned char Py_UCS1;
|
|
typedef unsigned short Py_UCS2;
|
|
|
|
typedef RE_UINT32 RE_CODE;
|
|
|
|
/* Properties in the General Category.
 *
 * Each code has the property id (RE_PROP_GC) in the bits above the low 16 and
 * the property value in the low 16 bits.
 */
#define RE_PROP_GC_CN ((RE_PROP_GC << 16) | RE_PROP_CN)
#define RE_PROP_GC_LU ((RE_PROP_GC << 16) | RE_PROP_LU)
#define RE_PROP_GC_LL ((RE_PROP_GC << 16) | RE_PROP_LL)
#define RE_PROP_GC_LT ((RE_PROP_GC << 16) | RE_PROP_LT)
#define RE_PROP_GC_P ((RE_PROP_GC << 16) | RE_PROP_P)
|
|
|
|
/* The repeat count that means "unlimited": an RE_CODE with every bit set. */
#define RE_UNLIMITED (~(RE_CODE)0)
|
|
|
|
/* The type that holds the status flags of a node (the RE_STATUS_* bits). */
typedef unsigned short RE_STATUS_T;
|
|
|
|
/* The 'concurrent' setting, i.e. whether the GIL is released while matching:
 * no, yes, or the default.
 */
#define RE_CONC_NO 0
#define RE_CONC_YES 1
#define RE_CONC_DEFAULT 2
|
|
|
|
/* The side that could truncate in a partial match.
 *
 * The values RE_PARTIAL_LEFT and RE_PARTIAL_RIGHT are also used as array
 * indexes, so they need to be 0 and 1.
 *
 * RE_PARTIAL_NONE is parenthesised so that the negative literal always
 * expands as a single operand, whatever expression it is used in.
 */
#define RE_PARTIAL_NONE (-1)
#define RE_PARTIAL_LEFT 0
#define RE_PARTIAL_RIGHT 1
|
|
|
|
/* Bit flags selecting the kind of 'sub' call: 'sub', 'subn', 'subf', 'subfn'.
 *
 * RE_SUBF is defined only when PY_VERSION_HEX is at least 0x02060000.
 */
#define RE_SUB 0x0
#define RE_SUBN 0x1
#if PY_VERSION_HEX >= 0x02060000
#define RE_SUBF 0x2
#endif
|
|
|
|
/* This module's name without its leading underscore. */
#define RE_MODULE "regex"
|
|
|
|
/* Error codes.
 *
 * Success and failure are non-negative; every other code is negative. The
 * negative values are parenthesised so that each one always expands as a
 * single operand. No code with the value -14 is defined here.
 */
#define RE_ERROR_SUCCESS 1 /* Successful match. */
#define RE_ERROR_FAILURE 0 /* Unsuccessful match. */
#define RE_ERROR_ILLEGAL (-1) /* Illegal code. */
#define RE_ERROR_INTERNAL (-2) /* Internal error. */
#define RE_ERROR_CONCURRENT (-3) /* "concurrent" invalid. */
#define RE_ERROR_MEMORY (-4) /* Out of memory. */
#define RE_ERROR_INTERRUPTED (-5) /* Signal handler raised exception. */
#define RE_ERROR_REPLACEMENT (-6) /* Invalid replacement string. */
#define RE_ERROR_INVALID_GROUP_REF (-7) /* Invalid group reference. */
#define RE_ERROR_GROUP_INDEX_TYPE (-8) /* Group index type error. */
#define RE_ERROR_NO_SUCH_GROUP (-9) /* No such group. */
#define RE_ERROR_INDEX (-10) /* String index. */
#define RE_ERROR_BACKTRACKING (-11) /* Too much backtracking. */
#define RE_ERROR_NOT_STRING (-12) /* Not a string. */
#define RE_ERROR_NOT_UNICODE (-13) /* Not a Unicode string. */
#define RE_ERROR_PARTIAL (-15) /* Partial match. */
|
|
|
|
/* How many backtrack entries each allocated block holds. */
#define RE_BACKTRACK_BLOCK_SIZE 64

/* The upper limit on the number of backtrack entries to allocate. */
#define RE_MAX_BACKTRACK_ALLOC (1024 * 1024)

/* The initial maximum capacity of the guard block. */
#define RE_INIT_GUARDS_BLOCK_SIZE 16

/* The initial maximum capacity of the node list. */
#define RE_INIT_NODE_LIST_SIZE 16

/* The size increment used for various allocation lists. */
#define RE_LIST_SIZE_INC 16

/* The initial maximum capacity of the capture groups. */
#define RE_INIT_CAPTURE_SIZE 16
|
|
|
|
/* Node bitflags. */
#define RE_POSITIVE_OP 0x1
#define RE_ZEROWIDTH_OP 0x2
#define RE_FUZZY_OP 0x4
#define RE_REVERSE_OP 0x8
#define RE_REQUIRED_OP 0x10

/* Guards against further matching can occur at the start of the body and at
 * the tail of a repeat containing a repeat.
 */
#define RE_STATUS_BODY 0x1
#define RE_STATUS_TAIL 0x2

/* Whether a guard is added depends on whether there's a repeat in the body of
 * the repeat or a group reference in the body or tail of the repeat.
 */
#define RE_STATUS_NEITHER 0x0
#define RE_STATUS_REPEAT 0x4
#define RE_STATUS_LIMITED 0x8
#define RE_STATUS_REF 0x10
#define RE_STATUS_VISITED_AG 0x20
#define RE_STATUS_VISITED_REP 0x40

/* Whether a string node has been initialised for fast searching. */
#define RE_STATUS_FAST_INIT 0x80

/* Whether a node is being used. (Additional nodes may be created while the
 * pattern is being built.)
 */
#define RE_STATUS_USED 0x100

/* Whether a node is a string node. */
#define RE_STATUS_STRING 0x200

/* Whether a repeat node is within another repeat. */
#define RE_STATUS_INNER 0x400

/* Some of the node bitflags are also stored in a node's status member, shifted
 * left by RE_STATUS_SHIFT so that they sit above the RE_STATUS_* bits defined
 * above.
 */
#define RE_STATUS_SHIFT 11

#define RE_STATUS_FUZZY (RE_FUZZY_OP << RE_STATUS_SHIFT)
#define RE_STATUS_REVERSE (RE_REVERSE_OP << RE_STATUS_SHIFT)
#define RE_STATUS_REQUIRED (RE_REQUIRED_OP << RE_STATUS_SHIFT)
|
|
|
|
/* The different error types for fuzzy matching: substitution, insertion,
 * deletion, and any error. RE_FUZZY_COUNT has the same value as RE_FUZZY_ERR.
 */
#define RE_FUZZY_SUB 0
#define RE_FUZZY_INS 1
#define RE_FUZZY_DEL 2
#define RE_FUZZY_ERR 3
#define RE_FUZZY_COUNT 3

/* The indexes of the various values in a FUZZY node. */
#define RE_FUZZY_VAL_MAX_SUB 1
#define RE_FUZZY_VAL_MAX_INS 2
#define RE_FUZZY_VAL_MAX_DEL 3
#define RE_FUZZY_VAL_MAX_ERR 4
#define RE_FUZZY_VAL_SUB_COST 5
#define RE_FUZZY_VAL_INS_COST 6
#define RE_FUZZY_VAL_DEL_COST 7
#define RE_FUZZY_VAL_MAX_COST 8

/* Adding an error type (RE_FUZZY_SUB, ...) to one of these bases gives the
 * index of the corresponding maximum or cost value above.
 */
#define RE_FUZZY_VAL_MAX_BASE 1
#define RE_FUZZY_VAL_COST_BASE 5

/* The indexes of the various values in an END_FUZZY node. */
#define RE_FUZZY_VAL_MIN_SUB 1
#define RE_FUZZY_VAL_MIN_INS 2
#define RE_FUZZY_VAL_MIN_DEL 3
#define RE_FUZZY_VAL_MIN_ERR 4
|
|
|
|
/* The combination of flags that is set when full Unicode case folding is in
 * effect.
 */
#define RE_FULL_CASE_FOLDING (RE_FLAG_UNICODE | RE_FLAG_FULLCASE | RE_FLAG_IGNORECASE)

/* The length of the shortest string prefix for which a fast string search
 * will be used.
 */
#define RE_MIN_FAST_LENGTH 5
|
|
|
|
/* The copyright notice for the engine. */
static char copyright[] = " RE 2.3.0 Copyright (c) 1997-2002 by Secret Labs AB ";
|
|
|
|
/* The exception to raise on error. */
|
|
static PyObject* error_exception;
|
|
|
|
/* The dictionary of Unicode properties. */
|
|
static PyObject* property_dict;
|
|
|
|
typedef struct RE_State* RE_StatePtr;
|
|
|
|
/* Handlers for ASCII, locale and Unicode. */
|
|
typedef struct RE_EncodingTable {
|
|
BOOL (*has_property)(RE_CODE property, Py_UCS4 ch);
|
|
BOOL (*at_boundary)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*at_word_start)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*at_word_end)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*at_default_boundary)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*at_default_word_start)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*at_default_word_end)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*at_grapheme_boundary)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*is_line_sep)(Py_UCS4 ch);
|
|
BOOL (*at_line_start)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*at_line_end)(RE_StatePtr state, Py_ssize_t text_pos);
|
|
BOOL (*possible_turkic)(Py_UCS4 ch);
|
|
int (*all_cases)(Py_UCS4 ch, Py_UCS4* codepoints);
|
|
Py_UCS4 (*simple_case_fold)(Py_UCS4 ch);
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
int (*all_turkic_i)(Py_UCS4 ch, Py_UCS4* cases);
|
|
} RE_EncodingTable;
|
|
|
|
/* Position within the regex and text. */
|
|
typedef struct RE_Position {
|
|
struct RE_Node* node;
|
|
Py_ssize_t text_pos;
|
|
} RE_Position;
|
|
|
|
/* Info about fuzzy matching. */
|
|
typedef struct RE_FuzzyInfo {
|
|
struct RE_Node* node;
|
|
size_t counts[RE_FUZZY_COUNT + 1]; /* Add 1 for total errors. */
|
|
size_t total_cost;
|
|
} RE_FuzzyInfo;
|
|
|
|
/* Storage for backtrack data. */
|
|
typedef struct RE_BacktrackData {
|
|
union {
|
|
struct {
|
|
size_t capture_change;
|
|
BOOL too_few_errors;
|
|
} atomic;
|
|
struct {
|
|
RE_Position position;
|
|
} branch;
|
|
struct {
|
|
RE_FuzzyInfo fuzzy_info;
|
|
Py_ssize_t text_pos;
|
|
RE_CODE index;
|
|
} fuzzy;
|
|
struct {
|
|
RE_Position position;
|
|
size_t count;
|
|
struct RE_Node* fuzzy_node;
|
|
BOOL too_few_errors;
|
|
} fuzzy_insert;
|
|
struct {
|
|
RE_Position position;
|
|
RE_INT8 fuzzy_type;
|
|
RE_INT8 step;
|
|
} fuzzy_item;
|
|
struct {
|
|
RE_Position position;
|
|
Py_ssize_t string_pos;
|
|
RE_INT8 fuzzy_type;
|
|
RE_INT8 folded_pos;
|
|
RE_INT8 folded_len;
|
|
RE_INT8 gfolded_pos;
|
|
RE_INT8 gfolded_len;
|
|
RE_INT8 step;
|
|
} fuzzy_string;
|
|
struct {
|
|
Py_ssize_t text_pos;
|
|
Py_ssize_t current_capture;
|
|
RE_CODE private_index;
|
|
RE_CODE public_index;
|
|
BOOL capture;
|
|
} group;
|
|
struct {
|
|
struct RE_Node* node;
|
|
size_t capture_change;
|
|
} group_call;
|
|
struct {
|
|
size_t capture_change;
|
|
BOOL too_few_errors;
|
|
} lookaround;
|
|
struct {
|
|
RE_Position position;
|
|
Py_ssize_t text_pos;
|
|
size_t count;
|
|
Py_ssize_t start;
|
|
size_t capture_change;
|
|
RE_CODE index;
|
|
} repeat;
|
|
};
|
|
RE_UINT8 op;
|
|
} RE_BacktrackData;
|
|
|
|
/* Storage for backtrack data is allocated in blocks for speed. */
|
|
typedef struct RE_BacktrackBlock {
|
|
RE_BacktrackData items[RE_BACKTRACK_BLOCK_SIZE];
|
|
struct RE_BacktrackBlock* previous;
|
|
struct RE_BacktrackBlock* next;
|
|
size_t capacity;
|
|
size_t count;
|
|
} RE_BacktrackBlock;
|
|
|
|
/* Storage for saved groups. */
|
|
typedef struct RE_SavedGroups {
|
|
struct RE_SavedGroups* previous;
|
|
struct RE_SavedGroups* next;
|
|
struct RE_GroupSpan* spans;
|
|
size_t* counts;
|
|
} RE_SavedGroups;
|
|
|
|
/* Storage for info around a recursive by 'basic'match'. */
|
|
typedef struct RE_Info {
|
|
RE_BacktrackBlock* current_backtrack_block;
|
|
size_t backtrack_count;
|
|
RE_SavedGroups* current_saved_groups;
|
|
struct RE_GroupCallFrame* current_group_call_frame;
|
|
BOOL must_advance;
|
|
} RE_Info;
|
|
|
|
/* Storage for the next node. */
|
|
typedef struct RE_NextNode {
|
|
struct RE_Node* node;
|
|
struct RE_Node* test;
|
|
struct RE_Node* match_next;
|
|
Py_ssize_t match_step;
|
|
} RE_NextNode;
|
|
|
|
/* A pattern node. */
|
|
typedef struct RE_Node {
|
|
RE_NextNode next_1;
|
|
union {
|
|
struct {
|
|
RE_NextNode next_2;
|
|
} nonstring;
|
|
struct {
|
|
/* Used only if (node->status & RE_STATUS_STRING) is true. */
|
|
Py_ssize_t* bad_character_offset;
|
|
Py_ssize_t* good_suffix_offset;
|
|
} string;
|
|
};
|
|
Py_ssize_t step;
|
|
size_t value_count;
|
|
RE_CODE* values;
|
|
RE_STATUS_T status;
|
|
RE_UINT8 op;
|
|
BOOL match;
|
|
} RE_Node;
|
|
|
|
/* Info about a group's span. */
|
|
typedef struct RE_GroupSpan {
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
} RE_GroupSpan;
|
|
|
|
/* Span of a guard (inclusive range). */
|
|
typedef struct RE_GuardSpan {
|
|
Py_ssize_t low;
|
|
Py_ssize_t high;
|
|
BOOL protect;
|
|
} RE_GuardSpan;
|
|
|
|
/* Spans guarded against further matching. */
|
|
typedef struct RE_GuardList {
|
|
size_t capacity;
|
|
size_t count;
|
|
RE_GuardSpan* spans;
|
|
Py_ssize_t last_text_pos;
|
|
size_t last_low;
|
|
} RE_GuardList;
|
|
|
|
/* Info about a group in a context. */
|
|
typedef struct RE_GroupData {
|
|
RE_GroupSpan span;
|
|
size_t capture_count;
|
|
size_t capture_capacity;
|
|
Py_ssize_t current_capture;
|
|
RE_GroupSpan* captures;
|
|
} RE_GroupData;
|
|
|
|
/* Info about a repeat. */
|
|
typedef struct RE_RepeatData {
|
|
RE_GuardList body_guard_list;
|
|
RE_GuardList tail_guard_list;
|
|
size_t count;
|
|
Py_ssize_t start;
|
|
size_t capture_change;
|
|
} RE_RepeatData;
|
|
|
|
/* Storage for saved repeats. */
|
|
typedef struct RE_SavedRepeats {
|
|
struct RE_SavedRepeats* previous;
|
|
struct RE_SavedRepeats* next;
|
|
RE_RepeatData* repeats;
|
|
} RE_SavedRepeats;
|
|
|
|
/* Guards for fuzzy sections. */
|
|
typedef struct RE_FuzzyGuards {
|
|
RE_GuardList body_guard_list;
|
|
RE_GuardList tail_guard_list;
|
|
} RE_FuzzyGuards;
|
|
|
|
/* Info about a capture group. */
|
|
typedef struct RE_GroupInfo {
|
|
Py_ssize_t end_index;
|
|
RE_Node* node;
|
|
BOOL referenced;
|
|
BOOL has_name;
|
|
} RE_GroupInfo;
|
|
|
|
/* Info about a call_ref. */
|
|
typedef struct RE_CallRefInfo {
|
|
RE_Node* node;
|
|
BOOL defined;
|
|
BOOL used;
|
|
} RE_CallRefInfo;
|
|
|
|
/* Info about a repeat. */
|
|
typedef struct RE_RepeatInfo {
|
|
RE_STATUS_T status;
|
|
} RE_RepeatInfo;
|
|
|
|
/* Stack frame for a group call. */
|
|
typedef struct RE_GroupCallFrame {
|
|
struct RE_GroupCallFrame* previous;
|
|
struct RE_GroupCallFrame* next;
|
|
RE_Node* node;
|
|
RE_GroupData* groups;
|
|
RE_RepeatData* repeats;
|
|
} RE_GroupCallFrame;
|
|
|
|
/* Info about a string argument. */
|
|
typedef struct RE_StringInfo {
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
Py_buffer view; /* View of the string if it's a buffer object. */
|
|
#endif
|
|
void* characters; /* Pointer to the characters of the string. */
|
|
Py_ssize_t length; /* Length of the string. */
|
|
Py_ssize_t charsize; /* Size of the characters in the string. */
|
|
BOOL is_unicode; /* Whether the string is Unicode. */
|
|
BOOL should_release; /* Whether the buffer should be released. */
|
|
} RE_StringInfo;
|
|
|
|
/* Info about where the next match was found, starting from a certain search
|
|
* position. This is used when a pattern starts with a BRANCH.
|
|
*/
|
|
#define MAX_SEARCH_POSITIONS 7
|
|
|
|
/* Info about a search position. */
|
|
typedef struct {
|
|
Py_ssize_t start_pos;
|
|
Py_ssize_t match_pos;
|
|
} RE_SearchPosition;
|
|
|
|
/* The state object used during matching. */
|
|
typedef struct RE_State {
|
|
struct PatternObject* pattern; /* Parent PatternObject. */
|
|
/* Info about the string being matched. */
|
|
PyObject* string;
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
Py_buffer view; /* View of the string if it's a buffer object. */
|
|
#endif
|
|
Py_ssize_t charsize;
|
|
void* text;
|
|
Py_ssize_t text_length;
|
|
/* The slice of the string being searched. */
|
|
Py_ssize_t slice_start;
|
|
Py_ssize_t slice_end;
|
|
/* Info about the capture groups. */
|
|
RE_GroupData* groups;
|
|
Py_ssize_t lastindex;
|
|
Py_ssize_t lastgroup;
|
|
/* Info about the repeats. */
|
|
RE_RepeatData* repeats;
|
|
Py_ssize_t search_anchor; /* Where the last match finished. */
|
|
Py_ssize_t match_pos; /* The start position of the match. */
|
|
Py_ssize_t text_pos; /* The current position of the match. */
|
|
Py_ssize_t final_newline; /* The index of newline at end of string, or -1. */
|
|
Py_ssize_t final_line_sep; /* The index of line separator at end of string, or -1. */
|
|
/* Storage for backtrack info. */
|
|
RE_BacktrackBlock backtrack_block;
|
|
RE_BacktrackBlock* current_backtrack_block;
|
|
Py_ssize_t backtrack_allocated;
|
|
RE_BacktrackData* backtrack;
|
|
/* Storage for saved capture groups. */
|
|
RE_SavedGroups* first_saved_groups;
|
|
RE_SavedGroups* current_saved_groups;
|
|
RE_SavedRepeats* first_saved_repeats;
|
|
RE_SavedRepeats* current_saved_repeats;
|
|
Py_ssize_t min_width; /* The minimum width of the string to match (assuming it's not a fuzzy pattern). */
|
|
RE_EncodingTable* encoding; /* The 'encoding' of the string being searched. */
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
|
|
void* (*point_to)(void* text, Py_ssize_t pos);
|
|
PyThread_type_lock lock; /* A lock for accessing the state across threads. */
|
|
RE_FuzzyInfo fuzzy_info; /* Info about fuzzy matching. */
|
|
size_t total_fuzzy_counts[RE_FUZZY_COUNT]; /* Totals for fuzzy matching. */
|
|
RE_FuzzyGuards* fuzzy_guards; /* The guards for a fuzzy match. */
|
|
size_t total_errors; /* The total number of errors of a fuzzy match. */
|
|
size_t total_cost; /* The total cost of a fuzzy match. */
|
|
size_t max_cost; /* The maximum permitted fuzzy cost. */
|
|
/* The group call stack. */
|
|
RE_GroupCallFrame* first_group_call_frame;
|
|
RE_GroupCallFrame* current_group_call_frame;
|
|
RE_GuardList* group_call_guard_list;
|
|
RE_SearchPosition search_positions[MAX_SEARCH_POSITIONS]; /* Where the search matches next. */
|
|
size_t capture_change; /* Incremented every time a captive group changes. */
|
|
Py_ssize_t req_pos; /* The position where the required string matched. */
|
|
Py_ssize_t req_end; /* The end position where the required string matched. */
|
|
int partial_side; /* The side that could truncate in a partial match. */
|
|
RE_UINT16 iterations; /* The number of iterations the matching engine has performed since checking for KeyboardInterrupt. */
|
|
BOOL is_unicode; /* Whether the string to be matched is Unicode. */
|
|
BOOL should_release; /* Whether the buffer should be released. */
|
|
BOOL overlapped; /* Whether the matches can be overlapped. */
|
|
BOOL reverse; /* Whether it's a reverse pattern. */
|
|
BOOL visible_captures; /* Whether the 'captures' method will be visible. */
|
|
BOOL version_0; /* Whether to perform version_0 behaviour (same as re module). */
|
|
BOOL must_advance; /* Whether the end of the match must advance past its start. */
|
|
BOOL is_multithreaded; /* Whether to release the GIL while matching. */
|
|
BOOL too_few_errors; /* Whether there were too few fuzzy errors. */
|
|
BOOL match_all; /* Whether to match all of the string ('fullmatch'). */
|
|
} RE_State;
|
|
|
|
/* Storage for the regex state and thread state.
|
|
*
|
|
* Scanner objects can sometimes be shared across threads, which means that
|
|
* their RE_State structs are also shared. This isn't safe when the GIL is
|
|
* released, so in such instances we have a lock (mutex) in the RE_State struct
|
|
* to protect it during matching. We also need a thread-safe place to store the
|
|
* thread state when releasing the GIL.
|
|
*/
|
|
typedef struct RE_SafeState {
|
|
RE_State* re_state;
|
|
PyThreadState* thread_state;
|
|
} RE_SafeState;
|
|
|
|
/* The PatternObject created from a regular expression. */
|
|
typedef struct PatternObject {
|
|
PyObject_HEAD
|
|
PyObject* pattern; /* Pattern source (or None). */
|
|
Py_ssize_t flags; /* Flags used when compiling pattern source. */
|
|
PyObject* weakreflist; /* List of weak references */
|
|
/* Nodes into which the regular expression is compiled. */
|
|
RE_Node* start_node;
|
|
RE_Node* start_test;
|
|
size_t true_group_count; /* The true number of capture groups. */
|
|
size_t public_group_count; /* The number of public capture groups. */
|
|
size_t repeat_count; /* The number of repeats. */
|
|
Py_ssize_t group_end_index; /* The number of group closures. */
|
|
PyObject* groupindex;
|
|
PyObject* indexgroup;
|
|
PyObject* named_lists;
|
|
size_t named_lists_count;
|
|
PyObject** partial_named_lists[2];
|
|
PyObject* named_list_indexes;
|
|
/* Storage for the pattern nodes. */
|
|
size_t node_capacity;
|
|
size_t node_count;
|
|
RE_Node** node_list;
|
|
/* Info about the capture groups. */
|
|
size_t group_info_capacity;
|
|
RE_GroupInfo* group_info;
|
|
/* Info about the call_refs. */
|
|
size_t call_ref_info_capacity;
|
|
size_t call_ref_info_count;
|
|
RE_CallRefInfo* call_ref_info;
|
|
Py_ssize_t pattern_call_ref;
|
|
/* Info about the repeats. */
|
|
size_t repeat_info_capacity;
|
|
RE_RepeatInfo* repeat_info;
|
|
Py_ssize_t min_width; /* The minimum width of the string to match (assuming it isn't a fuzzy pattern). */
|
|
RE_EncodingTable* encoding; /* Encoding handlers. */
|
|
RE_GroupData* groups_storage;
|
|
RE_RepeatData* repeats_storage;
|
|
size_t fuzzy_count; /* The number of fuzzy sections. */
|
|
Py_ssize_t req_offset; /* The offset to the required string. */
|
|
RE_Node* req_string; /* The required string. */
|
|
BOOL is_fuzzy; /* Whether it's a fuzzy pattern. */
|
|
BOOL do_search_start; /* Whether to do an initial search. */
|
|
BOOL recursive; /* Whether the entire pattern is recursive. */
|
|
} PatternObject;
|
|
|
|
/* The MatchObject created when a match is found. */
|
|
typedef struct MatchObject {
|
|
PyObject_HEAD
|
|
PyObject* string; /* Link to the target string or NULL if detached. */
|
|
PyObject* substring; /* Link to (a substring of) the target string. */
|
|
Py_ssize_t substring_offset; /* Offset into the target string. */
|
|
PatternObject* pattern; /* Link to the regex (pattern) object. */
|
|
Py_ssize_t pos; /* Start of current slice. */
|
|
Py_ssize_t endpos; /* End of current slice. */
|
|
Py_ssize_t match_start; /* Start of matched slice. */
|
|
Py_ssize_t match_end; /* End of matched slice. */
|
|
Py_ssize_t lastindex; /* Last group seen by the engine (-1 if none). */
|
|
Py_ssize_t lastgroup; /* Last named group seen by the engine (-1 if none). */
|
|
size_t group_count; /* The number of groups. */
|
|
RE_GroupData* groups; /* The capture groups. */
|
|
PyObject* regs;
|
|
size_t fuzzy_counts[RE_FUZZY_COUNT];
|
|
BOOL partial; /* Whether it's a partial match. */
|
|
} MatchObject;
|
|
|
|
/* The ScannerObject. */
|
|
typedef struct ScannerObject {
|
|
PyObject_HEAD
|
|
PatternObject* pattern;
|
|
RE_State state;
|
|
int status;
|
|
} ScannerObject;
|
|
|
|
/* The SplitterObject. */
|
|
typedef struct SplitterObject {
|
|
PyObject_HEAD
|
|
PatternObject* pattern;
|
|
RE_State state;
|
|
Py_ssize_t maxsplit;
|
|
Py_ssize_t last_pos;
|
|
Py_ssize_t split_count;
|
|
Py_ssize_t index;
|
|
int status;
|
|
} SplitterObject;
|
|
|
|
/* Info used when compiling a pattern to nodes. */
|
|
typedef struct RE_CompileArgs {
|
|
RE_CODE* code; /* The start of the compiled pattern. */
|
|
RE_CODE* end_code; /* The end of the compiled pattern. */
|
|
PatternObject* pattern; /* The pattern object. */
|
|
Py_ssize_t min_width; /* The minimum width of the string to match (assuming it isn't a fuzzy pattern). */
|
|
RE_Node* start; /* The start node. */
|
|
RE_Node* end; /* The end node. */
|
|
size_t repeat_depth; /* The nesting depth of the repeat. */
|
|
BOOL forward; /* Whether it's a forward (not reverse) pattern. */
|
|
BOOL visible_captures; /* Whether all of the captures will be visible. */
|
|
BOOL has_captures; /* Whether the pattern has capture groups. */
|
|
BOOL is_fuzzy; /* Whether the pattern (or some part of it) is fuzzy. */
|
|
BOOL within_fuzzy; /* Whether the subpattern is within a fuzzy section. */
|
|
} RE_CompileArgs;
|
|
|
|
/* The string slices which will be concatenated to make the result string of
|
|
* the 'sub' method.
|
|
*
|
|
* This allows us to avoid creating a list of slices if there of fewer than 2
|
|
* of them. Empty strings aren't recorded, so if 'list' and 'item' are both
|
|
* NULL then the result is an empty string.
|
|
*/
|
|
typedef struct JoinInfo {
|
|
PyObject* list; /* The list of slices if there are more than 2 of them. */
|
|
PyObject* item; /* The slice if there is only 1 of them. */
|
|
BOOL reversed; /* Whether the slices have been found in reverse order. */
|
|
BOOL is_unicode; /* Whether the string is Unicode. */
|
|
} JoinInfo;
|
|
|
|
/* Info about fuzzy matching. */
|
|
typedef struct {
|
|
RE_Node* new_node;
|
|
Py_ssize_t new_text_pos;
|
|
Py_ssize_t limit;
|
|
Py_ssize_t new_string_pos;
|
|
int step;
|
|
int new_folded_pos;
|
|
int folded_len;
|
|
int new_gfolded_pos;
|
|
int new_group_pos;
|
|
int fuzzy_type;
|
|
BOOL permit_insertion;
|
|
} RE_FuzzyData;
|
|
|
|
/* Function types for getting info from a MatchObject. */
|
|
typedef PyObject* (*RE_GetByIndexFunc)(MatchObject* self, Py_ssize_t index);
|
|
|
|
/* Returns the magnitude of a 'Py_ssize_t' value. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) abs_ssize_t(Py_ssize_t x) {
|
|
return x >= 0 ? x : -x;
|
|
}
|
|
|
|
/* Returns the minimum of 2 'Py_ssize_t' values. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) min_ssize_t(Py_ssize_t x, Py_ssize_t y) {
|
|
return x <= y ? x : y;
|
|
}
|
|
|
|
/* Returns the maximum of 2 'Py_ssize_t' values. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) max_ssize_t(Py_ssize_t x, Py_ssize_t y) {
|
|
return x >= y ? x : y;
|
|
}
|
|
|
|
/* Returns the minimum of 2 'size_t' values. */
|
|
Py_LOCAL_INLINE(size_t) min_size_t(size_t x, size_t y) {
|
|
return x <= y ? x : y;
|
|
}
|
|
|
|
/* Returns the maximum of 2 'size_t' values. */
|
|
Py_LOCAL_INLINE(size_t) max_size_t(size_t x, size_t y) {
|
|
return x >= y ? x : y;
|
|
}
|
|
|
|
/* Returns the 'maximum' of 2 RE_STATUS_T values. */
|
|
Py_LOCAL_INLINE(RE_STATUS_T) max_status_2(RE_STATUS_T x, RE_STATUS_T y) {
|
|
return x >= y ? x : y;
|
|
}
|
|
|
|
/* Returns the 'maximum' of 3 RE_STATUS_T values. */
|
|
Py_LOCAL_INLINE(RE_STATUS_T) max_status_3(RE_STATUS_T x, RE_STATUS_T y,
|
|
RE_STATUS_T z) {
|
|
return max_status_2(x, max_status_2(y, z));
|
|
}
|
|
|
|
/* Returns the 'maximum' of 4 RE_STATUS_T values. */
|
|
Py_LOCAL_INLINE(RE_STATUS_T) max_status_4(RE_STATUS_T w, RE_STATUS_T x,
|
|
RE_STATUS_T y, RE_STATUS_T z) {
|
|
return max_status_2(max_status_2(w, x), max_status_2(y, z));
|
|
}
|
|
|
|
/* Gets a character at a position assuming 1 byte per character. */
|
|
static Py_UCS4 bytes1_char_at(void* text, Py_ssize_t pos) {
|
|
return *((Py_UCS1*)text + pos);
|
|
}
|
|
|
|
/* Sets a character at a position assuming 1 byte per character. */
|
|
static void bytes1_set_char_at(void* text, Py_ssize_t pos, Py_UCS4 ch) {
|
|
*((Py_UCS1*)text + pos) = (Py_UCS1)ch;
|
|
}
|
|
|
|
/* Gets a pointer to a position assuming 1 byte per character. */
|
|
static void* bytes1_point_to(void* text, Py_ssize_t pos) {
|
|
return (Py_UCS1*)text + pos;
|
|
}
|
|
|
|
/* Gets a character at a position assuming 2 bytes per character. */
|
|
static Py_UCS4 bytes2_char_at(void* text, Py_ssize_t pos) {
|
|
return *((Py_UCS2*)text + pos);
|
|
}
|
|
|
|
/* Sets a character at a position assuming 2 bytes per character. */
|
|
static void bytes2_set_char_at(void* text, Py_ssize_t pos, Py_UCS4 ch) {
|
|
*((Py_UCS2*)text + pos) = (Py_UCS2)ch;
|
|
}
|
|
|
|
/* Gets a pointer to a position assuming 2 bytes per character. */
|
|
static void* bytes2_point_to(void* text, Py_ssize_t pos) {
|
|
return (Py_UCS2*)text + pos;
|
|
}
|
|
|
|
/* Gets a character at a position assuming 4 bytes per character. */
|
|
static Py_UCS4 bytes4_char_at(void* text, Py_ssize_t pos) {
|
|
return *((Py_UCS4*)text + pos);
|
|
}
|
|
|
|
/* Sets a character at a position assuming 4 bytes per character. */
|
|
static void bytes4_set_char_at(void* text, Py_ssize_t pos, Py_UCS4 ch) {
|
|
*((Py_UCS4*)text + pos) = (Py_UCS4)ch;
|
|
}
|
|
|
|
/* Gets a pointer to a position assuming 4 bytes per character. */
|
|
static void* bytes4_point_to(void* text, Py_ssize_t pos) {
|
|
return (Py_UCS4*)text + pos;
|
|
}
|
|
|
|
/* Default for whether a position is on a word boundary. */
|
|
static BOOL at_boundary_always(RE_State* state, Py_ssize_t text_pos) {
|
|
return TRUE;
|
|
}
|
|
|
|
/* Converts a BOOL to success/failure. */
|
|
Py_LOCAL_INLINE(int) bool_as_status(BOOL value) {
|
|
return value ? RE_ERROR_SUCCESS : RE_ERROR_FAILURE;
|
|
}
|
|
|
|
/* ASCII-specific. */
|
|
|
|
Py_LOCAL_INLINE(BOOL) unicode_has_property(RE_CODE property, Py_UCS4 ch);
|
|
|
|
/* Checks whether a character has a property. */
|
|
Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) {
|
|
if (ch > RE_ASCII_MAX) {
|
|
/* Outside the ASCII range. */
|
|
RE_UINT32 value;
|
|
|
|
value = property & 0xFFFF;
|
|
|
|
return value == 0;
|
|
}
|
|
|
|
return unicode_has_property(property, ch);
|
|
}
|
|
|
|
/* Wrapper for calling 'ascii_has_property' via a pointer. */
|
|
static BOOL ascii_has_property_wrapper(RE_CODE property, Py_UCS4 ch) {
|
|
return ascii_has_property(property, ch);
|
|
}
|
|
|
|
/* Checks whether there's a word character to the left. */
|
|
Py_LOCAL_INLINE(BOOL) ascii_word_left(RE_State* state, Py_ssize_t text_pos) {
|
|
return text_pos > 0 && ascii_has_property(RE_PROP_WORD,
|
|
state->char_at(state->text, text_pos - 1));
|
|
}
|
|
|
|
/* Checks whether there's a word character to the right. */
|
|
Py_LOCAL_INLINE(BOOL) ascii_word_right(RE_State* state, Py_ssize_t text_pos) {
|
|
return text_pos < state->text_length && ascii_has_property(RE_PROP_WORD,
|
|
state->char_at(state->text, text_pos));
|
|
}
|
|
|
|
/* Checks whether a position is on a word boundary. */
|
|
static BOOL ascii_at_boundary(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = ascii_word_left(state, text_pos);
|
|
right = ascii_word_right(state, text_pos);
|
|
|
|
return left != right;
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a word. */
|
|
static BOOL ascii_at_word_start(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = ascii_word_left(state, text_pos);
|
|
right = ascii_word_right(state, text_pos);
|
|
|
|
return !left && right;
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a word. */
|
|
static BOOL ascii_at_word_end(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = ascii_word_left(state, text_pos);
|
|
right = ascii_word_right(state, text_pos);
|
|
|
|
return left && !right;
|
|
}
|
|
|
|
/* Checks whether a character is a line separator. */
|
|
static BOOL ascii_is_line_sep(Py_UCS4 ch) {
|
|
return 0x0A <= ch && ch <= 0x0D;
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a line. */
|
|
static BOOL ascii_at_line_start(RE_State* state, Py_ssize_t text_pos) {
|
|
Py_UCS4 ch;
|
|
|
|
if (text_pos <= 0)
|
|
return TRUE;
|
|
|
|
ch = state->char_at(state->text, text_pos - 1);
|
|
|
|
if (ch == 0x0D) {
|
|
if (text_pos >= state->text_length)
|
|
return TRUE;
|
|
|
|
/* No line break inside CRLF. */
|
|
return state->char_at(state->text, text_pos) != 0x0A;
|
|
}
|
|
|
|
return 0x0A <= ch && ch <= 0x0D;
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a line. */
|
|
static BOOL ascii_at_line_end(RE_State* state, Py_ssize_t text_pos) {
|
|
Py_UCS4 ch;
|
|
|
|
if (text_pos >= state->text_length)
|
|
return TRUE;
|
|
|
|
ch = state->char_at(state->text, text_pos);
|
|
|
|
if (ch == 0x0A) {
|
|
if (text_pos <= 0)
|
|
return TRUE;
|
|
|
|
/* No line break inside CRLF. */
|
|
return state->char_at(state->text, text_pos - 1) != 0x0D;
|
|
}
|
|
|
|
return 0x0A <= ch && ch <= 0x0D;
|
|
}
|
|
|
|
/* Checks whether a character could be Turkic (variants of I/i). For ASCII, it
|
|
* won't be.
|
|
*/
|
|
static BOOL ascii_possible_turkic(Py_UCS4 ch) {
|
|
return FALSE;
|
|
}
|
|
|
|
/* Gets all the cases of a character. */
|
|
static int ascii_all_cases(Py_UCS4 ch, Py_UCS4* codepoints) {
|
|
int count;
|
|
|
|
count = 0;
|
|
|
|
codepoints[count++] = ch;
|
|
|
|
if (('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z'))
|
|
/* It's a letter, so add the other case. */
|
|
codepoints[count++] = ch ^ 0x20;
|
|
|
|
return count;
|
|
}
|
|
|
|
/* Returns a character with its case folded. */
|
|
static Py_UCS4 ascii_simple_case_fold(Py_UCS4 ch) {
|
|
if ('A' <= ch && ch <= 'Z')
|
|
/* Uppercase folds to lowercase. */
|
|
return ch ^ 0x20;
|
|
|
|
return ch;
|
|
}
|
|
|
|
/* Returns a character with its case folded. */
|
|
static int ascii_full_case_fold(Py_UCS4 ch, Py_UCS4* folded) {
|
|
if ('A' <= ch && ch <= 'Z')
|
|
/* Uppercase folds to lowercase. */
|
|
folded[0] = ch ^ 0x20;
|
|
else
|
|
folded[0] = ch;
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* Gets all the case variants of Turkic 'I'. The given character will be listed
|
|
* first.
|
|
*/
|
|
static int ascii_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
|
|
int count;
|
|
|
|
count = 0;
|
|
|
|
cases[count++] = ch;
|
|
|
|
if (ch != 'I')
|
|
cases[count++] = 'I';
|
|
|
|
if (ch != 'i')
|
|
cases[count++] = 'i';
|
|
|
|
return count;
|
|
}
|
|
|
|
/* The handlers for ASCII characters. */
|
|
static RE_EncodingTable ascii_encoding = {
|
|
ascii_has_property_wrapper,
|
|
ascii_at_boundary,
|
|
ascii_at_word_start,
|
|
ascii_at_word_end,
|
|
ascii_at_boundary, /* No special "default word boundary" for ASCII. */
|
|
ascii_at_word_start, /* No special "default start of word" for ASCII. */
|
|
ascii_at_word_end, /* No special "default end of a word" for ASCII. */
|
|
at_boundary_always, /* No special "grapheme boundary" for ASCII. */
|
|
ascii_is_line_sep,
|
|
ascii_at_line_start,
|
|
ascii_at_line_end,
|
|
ascii_possible_turkic,
|
|
ascii_all_cases,
|
|
ascii_simple_case_fold,
|
|
ascii_full_case_fold,
|
|
ascii_all_turkic_i,
|
|
};
|
|
|
|
/* Locale-specific. */
|
|
|
|
/* Checks whether a character has a property. */
|
|
Py_LOCAL_INLINE(BOOL) locale_has_property(RE_CODE property, Py_UCS4 ch) {
|
|
RE_UINT32 value;
|
|
RE_UINT32 v;
|
|
|
|
value = property & 0xFFFF;
|
|
|
|
if (ch > RE_LOCALE_MAX)
|
|
/* Outside the locale range. */
|
|
return value == 0;
|
|
|
|
switch (property >> 16) {
|
|
case RE_PROP_ALNUM >> 16:
|
|
v = isalnum((int)ch) != 0;
|
|
break;
|
|
case RE_PROP_ALPHA >> 16:
|
|
v = isalpha((int)ch) != 0;
|
|
break;
|
|
case RE_PROP_ANY >> 16:
|
|
v = 1;
|
|
break;
|
|
case RE_PROP_ASCII >> 16:
|
|
v = ch <= RE_ASCII_MAX;
|
|
break;
|
|
case RE_PROP_BLANK >> 16:
|
|
v = ch == '\t' || ch == ' ';
|
|
break;
|
|
case RE_PROP_GC:
|
|
switch (property) {
|
|
case RE_PROP_ASSIGNED:
|
|
v = ch <= RE_LOCALE_MAX;
|
|
break;
|
|
case RE_PROP_CASEDLETTER:
|
|
v = isalpha((int)ch) ? value : 0xFFFF;
|
|
break;
|
|
case RE_PROP_CNTRL:
|
|
v = iscntrl((int)ch) ? value : 0xFFFF;
|
|
break;
|
|
case RE_PROP_DIGIT:
|
|
v = isdigit((int)ch) ? value : 0xFFFF;
|
|
break;
|
|
case RE_PROP_GC_CN:
|
|
v = ch > RE_LOCALE_MAX;
|
|
break;
|
|
case RE_PROP_GC_LL:
|
|
v = islower((int)ch) ? value : 0xFFFF;
|
|
break;
|
|
case RE_PROP_GC_LU:
|
|
v = isupper((int)ch) ? value : 0xFFFF;
|
|
break;
|
|
case RE_PROP_GC_P:
|
|
v = ispunct((int)ch) ? value : 0xFFFF;
|
|
break;
|
|
default:
|
|
v = 0xFFFF;
|
|
break;
|
|
}
|
|
break;
|
|
case RE_PROP_GRAPH >> 16:
|
|
v = isgraph((int)ch) != 0;
|
|
break;
|
|
case RE_PROP_LOWER >> 16:
|
|
v = islower((int)ch) != 0;
|
|
break;
|
|
case RE_PROP_PRINT >> 16:
|
|
v = isprint((int)ch) != 0;
|
|
break;
|
|
case RE_PROP_SPACE >> 16:
|
|
v = isspace((int)ch) != 0;
|
|
break;
|
|
case RE_PROP_UPPER >> 16:
|
|
v = isupper((int)ch) != 0;
|
|
break;
|
|
case RE_PROP_WORD >> 16:
|
|
v = ch == '_' || isalnum((int)ch) != 0;
|
|
break;
|
|
case RE_PROP_XDIGIT >> 16:
|
|
v = re_get_hex_digit(ch) != 0;
|
|
break;
|
|
default:
|
|
v = 0;
|
|
break;
|
|
}
|
|
|
|
return v == value;
|
|
}
|
|
|
|
/* Wrapper for calling 'locale_has_property' via a pointer. */
|
|
static BOOL locale_has_property_wrapper(RE_CODE property, Py_UCS4 ch) {
|
|
return locale_has_property(property, ch);
|
|
}
|
|
|
|
/* Checks whether there's a word character to the left. */
|
|
Py_LOCAL_INLINE(BOOL) locale_word_left(RE_State* state, Py_ssize_t text_pos) {
|
|
return text_pos > 0 && locale_has_property(RE_PROP_WORD,
|
|
state->char_at(state->text, text_pos - 1));
|
|
}
|
|
|
|
/* Checks whether there's a word character to the right. */
|
|
Py_LOCAL_INLINE(BOOL) locale_word_right(RE_State* state, Py_ssize_t text_pos) {
|
|
return text_pos < state->text_length && locale_has_property(RE_PROP_WORD,
|
|
state->char_at(state->text, text_pos));
|
|
}
|
|
|
|
/* Checks whether a position is on a word boundary. */
|
|
static BOOL locale_at_boundary(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = locale_word_left(state, text_pos);
|
|
right = locale_word_right(state, text_pos);
|
|
|
|
return left != right;
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a word. */
|
|
static BOOL locale_at_word_start(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = locale_word_left(state, text_pos);
|
|
right = locale_word_right(state, text_pos);
|
|
|
|
return !left && right;
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a word. */
|
|
static BOOL locale_at_word_end(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = locale_word_left(state, text_pos);
|
|
right = locale_word_right(state, text_pos);
|
|
|
|
return left && !right;
|
|
}
|
|
|
|
/* Checks whether a character could be Turkic (variants of I/i). */
|
|
static BOOL locale_possible_turkic(Py_UCS4 ch) {
|
|
return toupper((int)ch) == 'I' || tolower((int)ch) == 'i';
|
|
}
|
|
|
|
/* Gets all the cases of a character. */
|
|
static int locale_all_cases(Py_UCS4 ch, Py_UCS4* codepoints) {
|
|
int count;
|
|
Py_UCS4 other;
|
|
|
|
count = 0;
|
|
|
|
codepoints[count++] = ch;
|
|
|
|
other = (Py_UCS4)toupper((int)ch);
|
|
if (other != ch)
|
|
codepoints[count++] = other;
|
|
|
|
other = (Py_UCS4)tolower((int)ch);
|
|
if (other != ch)
|
|
codepoints[count++] = other;
|
|
|
|
return count;
|
|
}
|
|
|
|
/* Returns a character with its case folded. */
|
|
static Py_UCS4 locale_simple_case_fold(Py_UCS4 ch) {
|
|
if (ch <= RE_LOCALE_MAX)
|
|
return (Py_UCS4)tolower((int)ch);
|
|
|
|
return ch;
|
|
}
|
|
|
|
/* Returns a character with its case folded. */
|
|
static int locale_full_case_fold(Py_UCS4 ch, Py_UCS4* folded) {
|
|
if (ch <= RE_LOCALE_MAX)
|
|
folded[0] = (Py_UCS4)tolower((int)ch);
|
|
else
|
|
folded[0] = ch;
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* Gets all the case variants of Turkic 'I'. The given character will be listed
|
|
* first.
|
|
*/
|
|
static int locale_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
|
|
int count;
|
|
Py_UCS4 other;
|
|
|
|
count = 0;
|
|
|
|
cases[count++] = ch;
|
|
|
|
if (ch != 'I')
|
|
cases[count++] = 'I';
|
|
|
|
if (ch != 'i')
|
|
cases[count++] = 'i';
|
|
|
|
/* Uppercase 'i' will be either dotted (Turkic) or dotless (non-Turkic). */
|
|
other = (Py_UCS4)toupper('i');
|
|
if (other != ch && other != 'I')
|
|
cases[count++] = other;
|
|
|
|
/* Lowercase 'I' will be either dotless (Turkic) or dotted (non-Turkic). */
|
|
other = (Py_UCS4)tolower('I');
|
|
if (other != ch && other != 'i')
|
|
cases[count++] = other;
|
|
|
|
return count;
|
|
}
|
|
|
|
/* The handlers for locale characters. */
|
|
static RE_EncodingTable locale_encoding = {
|
|
locale_has_property_wrapper,
|
|
locale_at_boundary,
|
|
locale_at_word_start,
|
|
locale_at_word_end,
|
|
locale_at_boundary, /* No special "default word boundary" for locale. */
|
|
locale_at_word_start, /* No special "default start of a word" for locale. */
|
|
locale_at_word_end, /* No special "default end of a word" for locale. */
|
|
at_boundary_always, /* No special "grapheme boundary" for locale. */
|
|
ascii_is_line_sep, /* Assume locale line separators are same as ASCII. */
|
|
ascii_at_line_start, /* Assume locale line separators are same as ASCII. */
|
|
ascii_at_line_end, /* Assume locale line separators are same as ASCII. */
|
|
locale_possible_turkic,
|
|
locale_all_cases,
|
|
locale_simple_case_fold,
|
|
locale_full_case_fold,
|
|
locale_all_turkic_i,
|
|
};
|
|
|
|
/* Unicode-specific. */
|
|
|
|
/* Checks whether a Unicode character has a property. */
|
|
Py_LOCAL_INLINE(BOOL) unicode_has_property(RE_CODE property, Py_UCS4 ch) {
|
|
RE_UINT32 prop;
|
|
RE_UINT32 value;
|
|
RE_UINT32 v;
|
|
|
|
prop = property >> 16;
|
|
if (prop >= sizeof(re_get_property) / sizeof(re_get_property[0]))
|
|
return FALSE;
|
|
|
|
value = property & 0xFFFF;
|
|
v = re_get_property[prop](ch);
|
|
|
|
if (v == value)
|
|
return TRUE;
|
|
|
|
if (prop == RE_PROP_GC) {
|
|
switch (value) {
|
|
case RE_PROP_ASSIGNED:
|
|
return v != RE_PROP_CN;
|
|
case RE_PROP_C:
|
|
return (RE_PROP_C_MASK & (1 << v)) != 0;
|
|
case RE_PROP_CASEDLETTER:
|
|
return v == RE_PROP_LU || v == RE_PROP_LL || v == RE_PROP_LT;
|
|
case RE_PROP_L:
|
|
return (RE_PROP_L_MASK & (1 << v)) != 0;
|
|
case RE_PROP_M:
|
|
return (RE_PROP_M_MASK & (1 << v)) != 0;
|
|
case RE_PROP_N:
|
|
return (RE_PROP_N_MASK & (1 << v)) != 0;
|
|
case RE_PROP_P:
|
|
return (RE_PROP_P_MASK & (1 << v)) != 0;
|
|
case RE_PROP_S:
|
|
return (RE_PROP_S_MASK & (1 << v)) != 0;
|
|
case RE_PROP_Z:
|
|
return (RE_PROP_Z_MASK & (1 << v)) != 0;
|
|
}
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Wrapper for calling 'unicode_has_property' via a pointer. */
|
|
static BOOL unicode_has_property_wrapper(RE_CODE property, Py_UCS4 ch) {
|
|
return unicode_has_property(property, ch);
|
|
}
|
|
|
|
/* Checks whether there's a word character to the left. */
|
|
Py_LOCAL_INLINE(BOOL) unicode_word_left(RE_State* state, Py_ssize_t text_pos) {
|
|
return text_pos > 0 && unicode_has_property(RE_PROP_WORD,
|
|
state->char_at(state->text, text_pos - 1));
|
|
}
|
|
|
|
/* Checks whether there's a word character to the right. */
|
|
Py_LOCAL_INLINE(BOOL) unicode_word_right(RE_State* state, Py_ssize_t text_pos)
|
|
{
|
|
return text_pos < state->text_length && unicode_has_property(RE_PROP_WORD,
|
|
state->char_at(state->text, text_pos));
|
|
}
|
|
|
|
/* Checks whether a position is on a word boundary. */
|
|
static BOOL unicode_at_boundary(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = unicode_word_left(state, text_pos);
|
|
right = unicode_word_right(state, text_pos);
|
|
|
|
return left != right;
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a word. */
|
|
static BOOL unicode_at_word_start(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = unicode_word_left(state, text_pos);
|
|
right = unicode_word_right(state, text_pos);
|
|
|
|
return !left && right;
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a word. */
|
|
static BOOL unicode_at_word_end(RE_State* state, Py_ssize_t text_pos) {
|
|
BOOL left;
|
|
BOOL right;
|
|
|
|
left = unicode_word_left(state, text_pos);
|
|
right = unicode_word_right(state, text_pos);
|
|
|
|
return left && !right;
|
|
}
|
|
|
|
/* Checks whether a character is a Unicode vowel.
|
|
*
|
|
* Only a limited number are treated as vowels.
|
|
*/
|
|
Py_LOCAL_INLINE(BOOL) is_unicode_vowel(Py_UCS4 ch) {
|
|
switch (Py_UNICODE_TOLOWER((Py_UNICODE)ch)) {
|
|
case 'a': case 0xE0: case 0xE1: case 0xE2:
|
|
case 'e': case 0xE8: case 0xE9: case 0xEA:
|
|
case 'i': case 0xEC: case 0xED: case 0xEE:
|
|
case 'o': case 0xF2: case 0xF3: case 0xF4:
|
|
case 'u': case 0xF9: case 0xFA: case 0xFB:
|
|
return TRUE;
|
|
default:
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
/* Checks whether a position is on a default word boundary.
|
|
*
|
|
* The rules are defined here:
|
|
* http://www.unicode.org/reports/tr29/#Default_Word_Boundaries
|
|
*/
|
|
static BOOL unicode_at_default_boundary(RE_State* state, Py_ssize_t text_pos) {
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
int prop;
|
|
int prop_m1;
|
|
Py_ssize_t pos_m1;
|
|
Py_ssize_t pos_m2;
|
|
int prop_m2;
|
|
Py_ssize_t pos_p0;
|
|
int prop_p0;
|
|
Py_ssize_t pos_p1;
|
|
int prop_p1;
|
|
|
|
/* Break at the start and end of the text. */
|
|
if (text_pos <= 0)
|
|
return TRUE;
|
|
|
|
if (text_pos >= state->text_length)
|
|
return TRUE;
|
|
|
|
char_at = state->char_at;
|
|
|
|
prop = (int)re_get_word_break(char_at(state->text, text_pos));
|
|
prop_m1 = (int)re_get_word_break(char_at(state->text, text_pos - 1));
|
|
|
|
/* Don't break within CRLF. */
|
|
if (prop_m1 == RE_BREAK_CR && prop == RE_BREAK_LF)
|
|
return FALSE;
|
|
|
|
/* Otherwise break before and after Newlines (including CR and LF). */
|
|
if (prop_m1 == RE_BREAK_NEWLINE || prop_m1 == RE_BREAK_CR || prop_m1 ==
|
|
RE_BREAK_LF || prop == RE_BREAK_NEWLINE || prop == RE_BREAK_CR || prop ==
|
|
RE_BREAK_LF)
|
|
return TRUE;
|
|
|
|
/* Get the property of the previous character. */
|
|
pos_m1 = text_pos - 1;
|
|
prop_m1 = RE_BREAK_OTHER;
|
|
while (pos_m1 >= 0) {
|
|
prop_m1 = (int)re_get_word_break(char_at(state->text, pos_m1));
|
|
if (prop_m1 != RE_BREAK_EXTEND && prop_m1 != RE_BREAK_FORMAT)
|
|
break;
|
|
|
|
--pos_m1;
|
|
}
|
|
|
|
/* Get the property of the preceding character. */
|
|
pos_m2 = pos_m1 - 1;
|
|
prop_m2 = RE_BREAK_OTHER;
|
|
while (pos_m2 >= 0) {
|
|
prop_m2 = (int)re_get_word_break(char_at(state->text, pos_m2));
|
|
if (prop_m2 != RE_BREAK_EXTEND && prop_m2 != RE_BREAK_FORMAT)
|
|
break;
|
|
|
|
--pos_m2;
|
|
}
|
|
|
|
/* Get the property of the next character. */
|
|
pos_p0 = text_pos;
|
|
prop_p0 = prop;
|
|
while (pos_p0 < state->text_length) {
|
|
prop_p0 = (int)re_get_word_break(char_at(state->text, pos_p0));
|
|
if (prop_p0 != RE_BREAK_EXTEND && prop_p0 != RE_BREAK_FORMAT)
|
|
break;
|
|
|
|
++pos_p0;
|
|
}
|
|
|
|
/* Get the property of the following character. */
|
|
pos_p1 = pos_p0 + 1;
|
|
prop_p1 = RE_BREAK_OTHER;
|
|
while (pos_p1 < state->text_length) {
|
|
prop_p1 = (int)re_get_word_break(char_at(state->text, pos_p1));
|
|
if (prop_p1 != RE_BREAK_EXTEND && prop_p1 != RE_BREAK_FORMAT)
|
|
break;
|
|
|
|
++pos_p1;
|
|
}
|
|
|
|
/* Don't break between most letters. */
|
|
if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_HEBREWLETTER) &&
|
|
(prop_p0 == RE_BREAK_ALETTER || prop_p0 == RE_BREAK_HEBREWLETTER))
|
|
return FALSE;
|
|
|
|
/* Break between apostrophe and vowels (French, Italian). */
|
|
if (pos_m1 >= 0 && char_at(state->text, pos_m1) == '\'' &&
|
|
is_unicode_vowel(char_at(state->text, text_pos)))
|
|
return TRUE;
|
|
|
|
/* Don't break letters across certain punctuation. */
|
|
if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_HEBREWLETTER) &&
|
|
(prop_p0 == RE_BREAK_MIDLETTER || prop_p0 == RE_BREAK_MIDNUMLET ||
|
|
prop_p0 == RE_BREAK_SINGLEQUOTE) && (prop_p1 == RE_BREAK_ALETTER ||
|
|
prop_p1 == RE_BREAK_HEBREWLETTER))
|
|
return FALSE;
|
|
if ((prop_m2 == RE_BREAK_ALETTER || prop_m2 == RE_BREAK_HEBREWLETTER) &&
|
|
(prop_m1 == RE_BREAK_MIDLETTER || prop_m1 == RE_BREAK_MIDNUMLET ||
|
|
prop_m1 == RE_BREAK_SINGLEQUOTE) && (prop_p0 == RE_BREAK_ALETTER ||
|
|
prop_p0 == RE_BREAK_HEBREWLETTER))
|
|
return FALSE;
|
|
if (prop_m1 == RE_BREAK_HEBREWLETTER && prop_p0 == RE_BREAK_SINGLEQUOTE)
|
|
return FALSE;
|
|
if (prop_m1 == RE_BREAK_HEBREWLETTER && prop_p0 == RE_BREAK_DOUBLEQUOTE &&
|
|
prop_p1 == RE_BREAK_HEBREWLETTER)
|
|
return FALSE;
|
|
if (prop_m2 == RE_BREAK_HEBREWLETTER && prop_m1 == RE_BREAK_DOUBLEQUOTE &&
|
|
prop_p0 == RE_BREAK_HEBREWLETTER)
|
|
return FALSE;
|
|
|
|
/* Don't break within sequences of digits, or digits adjacent to letters
|
|
* ("3a", or "A3").
|
|
*/
|
|
if (prop_m1 == RE_BREAK_NUMERIC && prop_p0 == RE_BREAK_NUMERIC)
|
|
return FALSE;
|
|
if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_HEBREWLETTER) &&
|
|
prop_p0 == RE_BREAK_NUMERIC)
|
|
return FALSE;
|
|
if (prop_m1 == RE_BREAK_NUMERIC && (prop_p0 == RE_BREAK_ALETTER || prop_p0
|
|
== RE_BREAK_HEBREWLETTER))
|
|
return FALSE;
|
|
|
|
/* Don't break within sequences, such as "3.2" or "3,456.789". */
|
|
if (prop_m2 == RE_BREAK_NUMERIC && (prop_m1 == RE_BREAK_MIDNUM || prop_m1
|
|
== RE_BREAK_MIDNUMLET || prop_m1 == RE_BREAK_SINGLEQUOTE) && prop_p0 ==
|
|
RE_BREAK_NUMERIC)
|
|
return FALSE;
|
|
if (prop_m1 == RE_BREAK_NUMERIC && (prop_p0 == RE_BREAK_MIDNUM || prop_p0
|
|
== RE_BREAK_MIDNUMLET || prop_p0 == RE_BREAK_SINGLEQUOTE) && prop_p1 ==
|
|
RE_BREAK_NUMERIC)
|
|
return FALSE;
|
|
|
|
/* Don't break between Katakana. */
|
|
if (prop_m1 == RE_BREAK_KATAKANA && prop_p0 == RE_BREAK_KATAKANA)
|
|
return FALSE;
|
|
|
|
/* Don't break from extenders. */
|
|
if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_HEBREWLETTER ||
|
|
prop_m1 == RE_BREAK_NUMERIC || prop_m1 == RE_BREAK_KATAKANA || prop_m1 ==
|
|
RE_BREAK_EXTENDNUMLET) && prop_p0 == RE_BREAK_EXTENDNUMLET)
|
|
return FALSE;
|
|
if (prop_m1 == RE_BREAK_EXTENDNUMLET && (prop_p0 == RE_BREAK_ALETTER ||
|
|
prop_p0 == RE_BREAK_HEBREWLETTER || prop_p0 == RE_BREAK_NUMERIC ||
|
|
prop_p0 == RE_BREAK_KATAKANA))
|
|
return FALSE;
|
|
|
|
/* Don't break between regional indicator symbols. */
|
|
if (prop_m1 == RE_BREAK_REGIONALINDICATOR && prop_p0 ==
|
|
RE_BREAK_REGIONALINDICATOR)
|
|
return FALSE;
|
|
|
|
/* Otherwise, break everywhere (including around ideographs). */
|
|
return TRUE;
|
|
}
|
|
|
|
/* Checks whether a position is at the start/end of a word. */
|
|
Py_LOCAL_INLINE(BOOL) unicode_at_default_word_start_or_end(RE_State* state,
|
|
Py_ssize_t text_pos, BOOL at_start) {
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
BOOL before;
|
|
BOOL after;
|
|
Py_UCS4 char_0;
|
|
Py_UCS4 char_m1;
|
|
int prop;
|
|
int prop_m1;
|
|
Py_ssize_t pos_m1;
|
|
Py_UCS4 char_p1;
|
|
Py_ssize_t pos_p1;
|
|
int prop_p1;
|
|
Py_ssize_t pos_m2;
|
|
Py_UCS4 char_m2;
|
|
int prop_m2;
|
|
|
|
char_at = state->char_at;
|
|
|
|
/* At the start or end of the text. */
|
|
if (text_pos <= 0 || text_pos >= state->text_length) {
|
|
before = unicode_word_left(state, text_pos);
|
|
after = unicode_word_right(state, text_pos);
|
|
|
|
return before != at_start && after == at_start;
|
|
}
|
|
|
|
char_0 = char_at(state->text, text_pos);
|
|
char_m1 = char_at(state->text, text_pos - 1);
|
|
prop = (int)re_get_word_break(char_0);
|
|
prop_m1 = (int)re_get_word_break(char_m1);
|
|
|
|
/* No break within CRLF. */
|
|
if (prop_m1 == RE_BREAK_CR && prop == RE_BREAK_LF)
|
|
return FALSE;
|
|
|
|
/* Break before and after Newlines (including CR and LF). */
|
|
if (prop_m1 == RE_BREAK_NEWLINE || prop_m1 == RE_BREAK_CR || prop_m1 ==
|
|
RE_BREAK_LF || prop == RE_BREAK_NEWLINE || prop == RE_BREAK_CR || prop ==
|
|
RE_BREAK_LF) {
|
|
before = unicode_has_property(RE_PROP_WORD, char_m1);
|
|
after = unicode_has_property(RE_PROP_WORD, char_0);
|
|
|
|
return before != at_start && after == at_start;
|
|
}
|
|
|
|
/* No break just before Format or Extend characters. */
|
|
if (prop == RE_BREAK_EXTEND || prop == RE_BREAK_FORMAT)
|
|
return FALSE;
|
|
|
|
/* Get the property of the previous character. */
|
|
pos_m1 = text_pos - 1;
|
|
prop_m1 = RE_BREAK_OTHER;
|
|
while (pos_m1 >= 0) {
|
|
char_m1 = char_at(state->text, pos_m1);
|
|
prop_m1 = (int)re_get_word_break(char_m1);
|
|
if (prop_m1 != RE_BREAK_EXTEND && prop_m1 != RE_BREAK_FORMAT)
|
|
break;
|
|
|
|
--pos_m1;
|
|
}
|
|
|
|
/* No break between most letters. */
|
|
if (prop_m1 == RE_BREAK_ALETTER && prop == RE_BREAK_ALETTER)
|
|
return FALSE;
|
|
|
|
if (pos_m1 >= 0 && char_m1 == '\'' && is_unicode_vowel(char_0))
|
|
return TRUE;
|
|
|
|
pos_p1 = text_pos + 1;
|
|
prop_p1 = RE_BREAK_OTHER;
|
|
while (pos_p1 < state->text_length) {
|
|
char_p1 = char_at(state->text, pos_p1);
|
|
prop_p1 = (int)re_get_word_break(char_p1);
|
|
if (prop_p1 != RE_BREAK_EXTEND && prop_p1 != RE_BREAK_FORMAT)
|
|
break;
|
|
|
|
++pos_p1;
|
|
}
|
|
|
|
/* No break letters across certain punctuation. */
|
|
if (prop_m1 == RE_BREAK_ALETTER && (prop == RE_BREAK_MIDLETTER || prop ==
|
|
RE_BREAK_MIDNUMLET) && prop_p1 == RE_BREAK_ALETTER)
|
|
return FALSE;
|
|
|
|
pos_m2 = pos_m1 - 1;
|
|
prop_m2 = RE_BREAK_OTHER;
|
|
while (pos_m2 >= 0) {
|
|
char_m2 = char_at(state->text, pos_m2);
|
|
prop_m2 = (int)re_get_word_break(char_m2);
|
|
if (prop_m2 != RE_BREAK_EXTEND && prop_m1 != RE_BREAK_FORMAT)
|
|
break;
|
|
|
|
--pos_m2;
|
|
}
|
|
|
|
if (prop_m2 == RE_BREAK_ALETTER && (prop_m1 == RE_BREAK_MIDLETTER ||
|
|
prop_m1 == RE_BREAK_MIDNUMLET) && prop == RE_BREAK_ALETTER)
|
|
return FALSE;
|
|
|
|
/* No break within sequences of digits, or digits adjacent to letters
|
|
* ("3a", or "A3").
|
|
*/
|
|
if ((prop_m1 == RE_BREAK_NUMERIC || prop_m1 == RE_BREAK_ALETTER) && prop ==
|
|
RE_BREAK_NUMERIC)
|
|
return FALSE;
|
|
|
|
if (prop_m1 == RE_BREAK_NUMERIC && prop == RE_BREAK_ALETTER)
|
|
return FALSE;
|
|
|
|
/* No break within sequences, such as "3.2" or "3,456.789". */
|
|
if (prop_m2 == RE_BREAK_NUMERIC && (prop_m1 == RE_BREAK_MIDNUM || prop_m1
|
|
== RE_BREAK_MIDNUMLET) && prop == RE_BREAK_NUMERIC)
|
|
return FALSE;
|
|
|
|
if (prop_m1 == RE_BREAK_NUMERIC && (prop == RE_BREAK_MIDNUM || prop ==
|
|
RE_BREAK_MIDNUMLET) && prop_p1 == RE_BREAK_NUMERIC)
|
|
return FALSE;
|
|
|
|
/* No break between Katakana. */
|
|
if (prop_m1 == RE_BREAK_KATAKANA && prop == RE_BREAK_KATAKANA)
|
|
return FALSE;
|
|
|
|
/* No break from extenders. */
|
|
if ((prop_m1 == RE_BREAK_ALETTER || prop_m1 == RE_BREAK_NUMERIC || prop_m1
|
|
== RE_BREAK_KATAKANA || prop_m1 == RE_BREAK_EXTENDNUMLET) && prop ==
|
|
RE_BREAK_EXTENDNUMLET)
|
|
return FALSE;
|
|
|
|
if (prop_m1 == RE_BREAK_EXTENDNUMLET && (prop == RE_BREAK_ALETTER || prop
|
|
== RE_BREAK_NUMERIC || prop == RE_BREAK_KATAKANA))
|
|
return FALSE;
|
|
|
|
/* Otherwise, break everywhere (including around ideographs). */
|
|
before = unicode_has_property(RE_PROP_WORD, char_m1);
|
|
after = unicode_has_property(RE_PROP_WORD, char_0);
|
|
|
|
return before != at_start && after == at_start;
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a word. */
|
|
static BOOL unicode_at_default_word_start(RE_State* state, Py_ssize_t text_pos)
|
|
{
|
|
return unicode_at_default_word_start_or_end(state, text_pos, TRUE);
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a word. */
|
|
static BOOL unicode_at_default_word_end(RE_State* state, Py_ssize_t text_pos) {
|
|
return unicode_at_default_word_start_or_end(state, text_pos, FALSE);
|
|
}
|
|
|
|
/* Checks whether a position is on a grapheme boundary.
|
|
*
|
|
* The rules are defined here:
|
|
* http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
|
|
*/
|
|
static BOOL unicode_at_grapheme_boundary(RE_State* state, Py_ssize_t text_pos)
|
|
{
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
int prop;
|
|
int prop_m1;
|
|
|
|
/* Break at the start and end of the text. */
|
|
if (text_pos <= 0)
|
|
return TRUE;
|
|
|
|
if (text_pos >= state->text_length)
|
|
return TRUE;
|
|
|
|
char_at = state->char_at;
|
|
|
|
prop = (int)re_get_grapheme_cluster_break(char_at(state->text, text_pos));
|
|
prop_m1 = (int)re_get_grapheme_cluster_break(char_at(state->text, text_pos
|
|
- 1));
|
|
|
|
/* Don't break within CRLF. */
|
|
if (prop_m1 == RE_GBREAK_CR && prop == RE_GBREAK_LF)
|
|
return FALSE;
|
|
|
|
/* Otherwise break before and after controls (including CR and LF). */
|
|
if (prop_m1 == RE_GBREAK_CONTROL || prop_m1 == RE_GBREAK_CR || prop_m1 ==
|
|
RE_GBREAK_LF || prop == RE_GBREAK_CONTROL || prop == RE_GBREAK_CR || prop
|
|
== RE_GBREAK_LF)
|
|
return TRUE;
|
|
|
|
/* Don't break Hangul syllable sequences. */
|
|
if (prop_m1 == RE_GBREAK_L && (prop == RE_GBREAK_L || prop == RE_GBREAK_V
|
|
|| prop == RE_GBREAK_LV || prop == RE_GBREAK_LVT))
|
|
return FALSE;
|
|
if ((prop_m1 == RE_GBREAK_LV || prop_m1 == RE_GBREAK_V) && (prop ==
|
|
RE_GBREAK_V || prop == RE_GBREAK_T))
|
|
return FALSE;
|
|
if ((prop_m1 == RE_GBREAK_LVT || prop_m1 == RE_GBREAK_T) && (prop ==
|
|
RE_GBREAK_T))
|
|
return FALSE;
|
|
|
|
/* Don't break between regional indicator symbols. */
|
|
if (prop_m1 == RE_GBREAK_REGIONALINDICATOR && prop ==
|
|
RE_GBREAK_REGIONALINDICATOR)
|
|
return FALSE;
|
|
|
|
/* Don't break just before Extend characters. */
|
|
if (prop == RE_GBREAK_EXTEND)
|
|
return FALSE;
|
|
|
|
/* Don't break before SpacingMarks, or after Prepend characters. */
|
|
if (prop == RE_GBREAK_SPACINGMARK)
|
|
return FALSE;
|
|
|
|
if (prop_m1 == RE_GBREAK_PREPEND)
|
|
return FALSE;
|
|
|
|
/* Otherwise, break everywhere. */
|
|
return TRUE;
|
|
}
|
|
|
|
/* Checks whether a character is a line separator. */
|
|
static BOOL unicode_is_line_sep(Py_UCS4 ch) {
|
|
return (0x0A <= ch && ch <= 0x0D) || ch == 0x85 || ch == 0x2028 || ch ==
|
|
0x2029;
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a line. */
|
|
static BOOL unicode_at_line_start(RE_State* state, Py_ssize_t text_pos) {
|
|
Py_UCS4 ch;
|
|
|
|
if (text_pos <= 0)
|
|
return TRUE;
|
|
|
|
ch = state->char_at(state->text, text_pos - 1);
|
|
|
|
if (ch == 0x0D) {
|
|
if (text_pos >= state->text_length)
|
|
return TRUE;
|
|
|
|
/* No line break inside CRLF. */
|
|
return state->char_at(state->text, text_pos) != 0x0A;
|
|
}
|
|
|
|
return (0x0A <= ch && ch <= 0x0D) || ch == 0x85 || ch == 0x2028 || ch ==
|
|
0x2029;
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a line. */
|
|
static BOOL unicode_at_line_end(RE_State* state, Py_ssize_t text_pos) {
|
|
Py_UCS4 ch;
|
|
|
|
if (text_pos >= state->text_length)
|
|
return TRUE;
|
|
|
|
ch = state->char_at(state->text, text_pos);
|
|
|
|
if (ch == 0x0A) {
|
|
if (text_pos <= 0)
|
|
return TRUE;
|
|
|
|
/* No line break inside CRLF. */
|
|
return state->char_at(state->text, text_pos - 1) != 0x0D;
|
|
}
|
|
|
|
return (0x0A <= ch && ch <= 0x0D) || ch == 0x85 || ch == 0x2028 || ch ==
|
|
0x2029;
|
|
}
|
|
|
|
/* Checks whether a character could be Turkic (variants of I/i). */
|
|
static BOOL unicode_possible_turkic(Py_UCS4 ch) {
|
|
return ch == 'I' || ch == 'i' || ch == 0x0130 || ch == 0x0131;
|
|
}
|
|
|
|
/* Gets all the cases of a character. */
|
|
static int unicode_all_cases(Py_UCS4 ch, Py_UCS4* codepoints) {
|
|
return re_get_all_cases(ch, codepoints);
|
|
}
|
|
|
|
/* Returns a character with its case folded, unless it could be Turkic
|
|
* (variants of I/i).
|
|
*/
|
|
static Py_UCS4 unicode_simple_case_fold(Py_UCS4 ch) {
|
|
/* Is it a possible Turkic character? If so, pass it through unchanged. */
|
|
if (ch == 'I' || ch == 'i' || ch == 0x0130 || ch == 0x0131)
|
|
return ch;
|
|
|
|
return (Py_UCS4)re_get_simple_case_folding(ch);
|
|
}
|
|
|
|
/* Returns a character with its case folded, unless it could be Turkic
|
|
* (variants of I/i).
|
|
*/
|
|
static int unicode_full_case_fold(Py_UCS4 ch, Py_UCS4* folded) {
|
|
/* Is it a possible Turkic character? If so, pass it through unchanged. */
|
|
if (ch == 'I' || ch == 'i' || ch == 0x0130 || ch == 0x0131) {
|
|
folded[0] = ch;
|
|
return 1;
|
|
}
|
|
|
|
return re_get_full_case_folding(ch, folded);
|
|
}
|
|
|
|
/* Gets all the case variants of Turkic 'I'. */
|
|
static int unicode_all_turkic_i(Py_UCS4 ch, Py_UCS4* cases) {
|
|
int count;
|
|
|
|
count = 0;
|
|
|
|
cases[count++] = ch;
|
|
|
|
if (ch != 'I')
|
|
cases[count++] = 'I';
|
|
|
|
if (ch != 'i')
|
|
cases[count++] = 'i';
|
|
|
|
if (ch != 0x130)
|
|
cases[count++] = 0x130;
|
|
|
|
if (ch != 0x131)
|
|
cases[count++] = 0x131;
|
|
|
|
return count;
|
|
|
|
}
|
|
|
|
/* The handlers for Unicode characters. */
|
|
static RE_EncodingTable unicode_encoding = {
|
|
unicode_has_property_wrapper,
|
|
unicode_at_boundary,
|
|
unicode_at_word_start,
|
|
unicode_at_word_end,
|
|
unicode_at_default_boundary,
|
|
unicode_at_default_word_start,
|
|
unicode_at_default_word_end,
|
|
unicode_at_grapheme_boundary,
|
|
unicode_is_line_sep,
|
|
unicode_at_line_start,
|
|
unicode_at_line_end,
|
|
unicode_possible_turkic,
|
|
unicode_all_cases,
|
|
unicode_simple_case_fold,
|
|
unicode_full_case_fold,
|
|
unicode_all_turkic_i,
|
|
};
|
|
|
|
/* Forward declaration: 'get_object' is defined later in the file but is needed
 * by 'set_error' below to look up the module's 'error' exception.
 */
Py_LOCAL_INLINE(PyObject*) get_object(char* module_name, char* object_name);
|
|
|
|
/* Sets the error message. */
|
|
Py_LOCAL_INLINE(void) set_error(int status, PyObject* object) {
|
|
TRACE(("<<set_error>>\n"))
|
|
|
|
if (!error_exception)
|
|
error_exception = get_object("_" RE_MODULE "_core", "error");
|
|
|
|
switch (status) {
|
|
case RE_ERROR_BACKTRACKING:
|
|
PyErr_SetString(error_exception, "too much backtracking");
|
|
break;
|
|
case RE_ERROR_CONCURRENT:
|
|
PyErr_SetString(PyExc_ValueError, "concurrent not int or None");
|
|
break;
|
|
case RE_ERROR_GROUP_INDEX_TYPE:
|
|
if (object)
|
|
PyErr_Format(PyExc_TypeError,
|
|
"group indices must be integers or strings, not %.200s",
|
|
object->ob_type->tp_name);
|
|
else
|
|
PyErr_Format(PyExc_TypeError,
|
|
"group indices must be integers or strings");
|
|
break;
|
|
case RE_ERROR_ILLEGAL:
|
|
PyErr_SetString(PyExc_RuntimeError, "invalid RE code");
|
|
break;
|
|
case RE_ERROR_INDEX:
|
|
PyErr_SetString(PyExc_TypeError, "string indices must be integers");
|
|
break;
|
|
case RE_ERROR_INTERRUPTED:
|
|
/* An exception has already been raised, so let it fly. */
|
|
break;
|
|
case RE_ERROR_INVALID_GROUP_REF:
|
|
PyErr_SetString(error_exception, "invalid group reference");
|
|
break;
|
|
case RE_ERROR_MEMORY:
|
|
PyErr_NoMemory();
|
|
break;
|
|
case RE_ERROR_NOT_STRING:
|
|
PyErr_Format(PyExc_TypeError, "expected string instance, %.200s found",
|
|
object->ob_type->tp_name);
|
|
break;
|
|
case RE_ERROR_NOT_UNICODE:
|
|
PyErr_Format(PyExc_TypeError,
|
|
"expected unicode instance, %.200s found", object->ob_type->tp_name);
|
|
break;
|
|
case RE_ERROR_NO_SUCH_GROUP:
|
|
PyErr_SetString(PyExc_IndexError, "no such group");
|
|
break;
|
|
case RE_ERROR_REPLACEMENT:
|
|
PyErr_SetString(error_exception, "invalid replacement");
|
|
break;
|
|
default:
|
|
/* Other error codes indicate compiler/engine bugs. */
|
|
PyErr_SetString(PyExc_RuntimeError,
|
|
"internal error in regular expression engine");
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Allocates memory.
|
|
*
|
|
* Sets the Python error handler and returns NULL if the allocation fails.
|
|
*/
|
|
Py_LOCAL_INLINE(void*) re_alloc(size_t size) {
|
|
void* new_ptr;
|
|
|
|
new_ptr = PyMem_Malloc(size);
|
|
if (!new_ptr)
|
|
set_error(RE_ERROR_MEMORY, NULL);
|
|
|
|
return new_ptr;
|
|
}
|
|
|
|
/* Reallocates memory.
|
|
*
|
|
* Sets the Python error handler and returns NULL if the reallocation fails.
|
|
*/
|
|
Py_LOCAL_INLINE(void*) re_realloc(void* ptr, size_t size) {
|
|
void* new_ptr;
|
|
|
|
new_ptr = PyMem_Realloc(ptr, size);
|
|
if (!new_ptr)
|
|
set_error(RE_ERROR_MEMORY, NULL);
|
|
|
|
return new_ptr;
|
|
}
|
|
|
|
/* Deallocates memory. */
|
|
Py_LOCAL_INLINE(void) re_dealloc(void* ptr) {
|
|
PyMem_Free(ptr);
|
|
}
|
|
|
|
/* Releases the GIL if multithreading is enabled. */
|
|
Py_LOCAL_INLINE(void) release_GIL(RE_SafeState* safe_state) {
|
|
if (safe_state->re_state->is_multithreaded)
|
|
safe_state->thread_state = PyEval_SaveThread();
|
|
}
|
|
|
|
/* Acquires the GIL if multithreading is enabled. */
|
|
Py_LOCAL_INLINE(void) acquire_GIL(RE_SafeState* safe_state) {
|
|
if (safe_state->re_state->is_multithreaded)
|
|
PyEval_RestoreThread(safe_state->thread_state);
|
|
}
|
|
|
|
/* Allocates memory, holding the GIL during the allocation.
|
|
*
|
|
* Sets the Python error handler and returns NULL if the allocation fails.
|
|
*/
|
|
Py_LOCAL_INLINE(void*) safe_alloc(RE_SafeState* safe_state, size_t size) {
|
|
void* new_ptr;
|
|
|
|
acquire_GIL(safe_state);
|
|
|
|
new_ptr = re_alloc(size);
|
|
|
|
release_GIL(safe_state);
|
|
|
|
return new_ptr;
|
|
}
|
|
|
|
/* Reallocates memory, holding the GIL during the reallocation.
|
|
*
|
|
* Sets the Python error handler and returns NULL if the reallocation fails.
|
|
*/
|
|
Py_LOCAL_INLINE(void*) safe_realloc(RE_SafeState* safe_state, void* ptr, size_t
|
|
size) {
|
|
void* new_ptr;
|
|
|
|
acquire_GIL(safe_state);
|
|
|
|
new_ptr = re_realloc(ptr, size);
|
|
|
|
release_GIL(safe_state);
|
|
|
|
return new_ptr;
|
|
}
|
|
|
|
/* Deallocates memory, holding the GIL during the deallocation. */
|
|
Py_LOCAL_INLINE(void) safe_dealloc(RE_SafeState* safe_state, void* ptr) {
|
|
acquire_GIL(safe_state);
|
|
|
|
re_dealloc(ptr);
|
|
|
|
release_GIL(safe_state);
|
|
}
|
|
|
|
/* Checks for KeyboardInterrupt, holding the GIL during the check. */
|
|
Py_LOCAL_INLINE(BOOL) safe_check_signals(RE_SafeState* safe_state) {
|
|
BOOL result;
|
|
|
|
acquire_GIL(safe_state);
|
|
|
|
result = (BOOL)PyErr_CheckSignals();
|
|
|
|
release_GIL(safe_state);
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Checks whether a character is in a range. */
|
|
Py_LOCAL_INLINE(BOOL) in_range(RE_EncodingTable* encoding, Py_UCS4 lower,
|
|
Py_UCS4 upper, Py_UCS4 ch) {
|
|
return lower <= ch && ch <= upper;
|
|
}
|
|
|
|
/* Checks whether a character is in a range, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) in_range_ign(RE_EncodingTable* encoding, Py_UCS4 lower,
|
|
Py_UCS4 upper, Py_UCS4 ch) {
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int count;
|
|
int i;
|
|
|
|
count = encoding->all_cases(ch, cases);
|
|
|
|
for (i = 0; i < count; i++) {
|
|
if (in_range(encoding, lower, upper, cases[i]))
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Checks whether 2 characters are the same. */
|
|
Py_LOCAL_INLINE(BOOL) same_char(RE_EncodingTable* encoding, Py_UCS4 ch1,
|
|
Py_UCS4 ch2) {
|
|
return ch1 == ch2;
|
|
}
|
|
|
|
/* Wrapper for calling 'same_char' via a pointer. */
|
|
static BOOL same_char_wrapper(RE_EncodingTable* encoding, Py_UCS4 ch1, Py_UCS4
|
|
ch2) {
|
|
return same_char(encoding, ch1, ch2);
|
|
}
|
|
|
|
/* Checks whether 2 characters are the same, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) same_char_ign(RE_EncodingTable* encoding, Py_UCS4 ch1,
|
|
Py_UCS4 ch2) {
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int count;
|
|
int i;
|
|
|
|
if (ch1 == ch2)
|
|
return TRUE;
|
|
|
|
count = encoding->all_cases(ch1, cases);
|
|
|
|
for (i = 1; i < count; i++) {
|
|
if (cases[i] == ch2)
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Wrapper for calling 'same_char' via a pointer. */
|
|
static BOOL same_char_ign_wrapper(RE_EncodingTable* encoding, Py_UCS4 ch1,
|
|
Py_UCS4 ch2) {
|
|
return same_char_ign(encoding, ch1, ch2);
|
|
}
|
|
|
|
/* Checks whether a character is anything except a newline. */
|
|
Py_LOCAL_INLINE(BOOL) matches_ANY(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch) {
|
|
return ch != '\n';
|
|
}
|
|
|
|
/* Checks whether a character is anything except a line separator. */
|
|
Py_LOCAL_INLINE(BOOL) matches_ANY_U(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch) {
|
|
return !encoding->is_line_sep(ch);
|
|
}
|
|
|
|
/* Checks whether 2 characters are the same. */
|
|
Py_LOCAL_INLINE(BOOL) matches_CHARACTER(RE_EncodingTable* encoding, RE_Node*
|
|
node, Py_UCS4 ch) {
|
|
return same_char(encoding, node->values[0], ch);
|
|
}
|
|
|
|
/* Checks whether 2 characters are the same, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) matches_CHARACTER_IGN(RE_EncodingTable* encoding,
|
|
RE_Node* node, Py_UCS4 ch) {
|
|
return same_char_ign(encoding, node->values[0], ch);
|
|
}
|
|
|
|
/* Checks whether a character has a property. */
|
|
Py_LOCAL_INLINE(BOOL) matches_PROPERTY(RE_EncodingTable* encoding, RE_Node*
|
|
node, Py_UCS4 ch) {
|
|
return encoding->has_property(node->values[0], ch);
|
|
}
|
|
|
|
/* Checks whether a character has a property, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) matches_PROPERTY_IGN(RE_EncodingTable* encoding, RE_Node*
|
|
node, Py_UCS4 ch) {
|
|
RE_UINT32 property;
|
|
RE_UINT32 prop;
|
|
|
|
property = node->values[0];
|
|
prop = property >> 16;
|
|
|
|
/* We need to do special handling of case-sensitive properties according to
|
|
* the 'encoding'.
|
|
*/
|
|
if (encoding == &unicode_encoding) {
|
|
/* We are working with Unicode. */
|
|
if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property
|
|
== RE_PROP_GC_LT) {
|
|
RE_UINT32 value;
|
|
|
|
value = re_get_general_category(ch);
|
|
|
|
return value == RE_PROP_LU || value == RE_PROP_LL || value ==
|
|
RE_PROP_LT;
|
|
} else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE)
|
|
return (BOOL)re_get_cased(ch);
|
|
|
|
/* The property is case-insensitive. */
|
|
return unicode_has_property(property, ch);
|
|
} else if (encoding == &ascii_encoding) {
|
|
/* We are working with ASCII. */
|
|
if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property
|
|
== RE_PROP_GC_LT) {
|
|
RE_UINT32 value;
|
|
|
|
value = re_get_general_category(ch);
|
|
|
|
return value == RE_PROP_LU || value == RE_PROP_LL || value ==
|
|
RE_PROP_LT;
|
|
} else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE)
|
|
return (BOOL)re_get_cased(ch);
|
|
|
|
/* The property is case-insensitive. */
|
|
return ascii_has_property(property, ch);
|
|
} else {
|
|
/* We are working with Locale. */
|
|
if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property
|
|
== RE_PROP_GC_LT)
|
|
return ch <= RE_LOCALE_MAX && (isupper((int)ch) ||
|
|
islower((int)ch)) != 0;
|
|
else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE)
|
|
return ch <= RE_LOCALE_MAX && (isupper((int)ch) ||
|
|
islower((int)ch)) != 0;
|
|
|
|
/* The property is case-insensitive. */
|
|
return locale_has_property(property, ch);
|
|
}
|
|
}
|
|
|
|
/* Checks whether a character is in a range. */
|
|
Py_LOCAL_INLINE(BOOL) matches_RANGE(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch) {
|
|
return in_range(encoding, node->values[0], node->values[1], ch);
|
|
}
|
|
|
|
/* Checks whether a character is in a range, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) matches_RANGE_IGN(RE_EncodingTable* encoding, RE_Node*
|
|
node, Py_UCS4 ch) {
|
|
return in_range_ign(encoding, node->values[0], node->values[1], ch);
|
|
}
|
|
|
|
Py_LOCAL_INLINE(BOOL) in_set_diff(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch);
|
|
Py_LOCAL_INLINE(BOOL) in_set_inter(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch);
|
|
Py_LOCAL_INLINE(BOOL) in_set_sym_diff(RE_EncodingTable* encoding, RE_Node*
|
|
node, Py_UCS4 ch);
|
|
Py_LOCAL_INLINE(BOOL) in_set_union(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch);
|
|
|
|
/* Checks whether a character matches a set member. */
|
|
Py_LOCAL_INLINE(BOOL) matches_member(RE_EncodingTable* encoding, RE_Node*
|
|
member, Py_UCS4 ch) {
|
|
switch (member->op) {
|
|
case RE_OP_CHARACTER:
|
|
/* values are: char_code */
|
|
TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
|
|
member->values[0]))
|
|
return ch == member->values[0];
|
|
case RE_OP_PROPERTY:
|
|
/* values are: property */
|
|
TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
|
|
member->values[0]))
|
|
return encoding->has_property(member->values[0], ch);
|
|
case RE_OP_RANGE:
|
|
/* values are: lower, upper */
|
|
TRACE(("%s %d %d %d\n", re_op_text[member->op], member->match,
|
|
member->values[0], member->values[1]))
|
|
return in_range(encoding, member->values[0], member->values[1], ch);
|
|
case RE_OP_SET_DIFF:
|
|
TRACE(("%s\n", re_op_text[member->op]))
|
|
return in_set_diff(encoding, member, ch);
|
|
case RE_OP_SET_INTER:
|
|
TRACE(("%s\n", re_op_text[member->op]))
|
|
return in_set_inter(encoding, member, ch);
|
|
case RE_OP_SET_SYM_DIFF:
|
|
TRACE(("%s\n", re_op_text[member->op]))
|
|
return in_set_sym_diff(encoding, member, ch);
|
|
case RE_OP_SET_UNION:
|
|
TRACE(("%s\n", re_op_text[member->op]))
|
|
return in_set_union(encoding, member, ch);
|
|
case RE_OP_STRING:
|
|
{
|
|
/* values are: char_code, char_code, ... */
|
|
size_t i;
|
|
TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
|
|
member->value_count))
|
|
|
|
for (i = 0; i < member->value_count; i++) {
|
|
if (ch == member->values[i])
|
|
return TRUE;
|
|
}
|
|
return FALSE;
|
|
}
|
|
default:
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
/* Checks whether a character matches a set member, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) matches_member_ign(RE_EncodingTable* encoding, RE_Node*
|
|
member, int case_count, Py_UCS4* cases) {
|
|
int i;
|
|
|
|
for (i = 0; i < case_count; i++) {
|
|
switch (member->op) {
|
|
case RE_OP_CHARACTER:
|
|
/* values are: char_code */
|
|
TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
|
|
member->values[0]))
|
|
if (cases[i] == member->values[0])
|
|
return TRUE;
|
|
break;
|
|
case RE_OP_PROPERTY:
|
|
/* values are: property */
|
|
TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
|
|
member->values[0]))
|
|
if (encoding->has_property(member->values[0], cases[i]))
|
|
return TRUE;
|
|
break;
|
|
case RE_OP_RANGE:
|
|
/* values are: lower, upper */
|
|
TRACE(("%s %d %d %d\n", re_op_text[member->op], member->match,
|
|
member->values[0], member->values[1]))
|
|
if (in_range(encoding, member->values[0], member->values[1],
|
|
cases[i]))
|
|
return TRUE;
|
|
break;
|
|
case RE_OP_SET_DIFF:
|
|
TRACE(("%s\n", re_op_text[member->op]))
|
|
if (in_set_diff(encoding, member, cases[i]))
|
|
return TRUE;
|
|
break;
|
|
case RE_OP_SET_INTER:
|
|
TRACE(("%s\n", re_op_text[member->op]))
|
|
if (in_set_inter(encoding, member, cases[i]))
|
|
return TRUE;
|
|
break;
|
|
case RE_OP_SET_SYM_DIFF:
|
|
TRACE(("%s\n", re_op_text[member->op]))
|
|
if (in_set_sym_diff(encoding, member, cases[i]))
|
|
return TRUE;
|
|
break;
|
|
case RE_OP_SET_UNION:
|
|
TRACE(("%s\n", re_op_text[member->op]))
|
|
if (in_set_union(encoding, member, cases[i]))
|
|
return TRUE;
|
|
break;
|
|
case RE_OP_STRING:
|
|
{
|
|
size_t j;
|
|
TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
|
|
member->value_count))
|
|
|
|
for (j = 0; j < member->value_count; j++) {
|
|
if (cases[i] == member->values[j])
|
|
return TRUE;
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
return TRUE;
|
|
}
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Checks whether a character is in a set difference. */
|
|
Py_LOCAL_INLINE(BOOL) in_set_diff(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch) {
|
|
RE_Node* member;
|
|
|
|
member = node->nonstring.next_2.node;
|
|
|
|
if (matches_member(encoding, member, ch) != member->match)
|
|
return FALSE;
|
|
|
|
member = member->next_1.node;
|
|
|
|
while (member) {
|
|
if (matches_member(encoding, member, ch) == member->match)
|
|
return FALSE;
|
|
|
|
member = member->next_1.node;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Checks whether a character is in a set difference, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) in_set_diff_ign(RE_EncodingTable* encoding, RE_Node*
|
|
node, int case_count, Py_UCS4* cases) {
|
|
RE_Node* member;
|
|
|
|
member = node->nonstring.next_2.node;
|
|
|
|
if (matches_member_ign(encoding, member, case_count, cases) !=
|
|
member->match)
|
|
return FALSE;
|
|
|
|
member = member->next_1.node;
|
|
|
|
while (member) {
|
|
if (matches_member_ign(encoding, member, case_count, cases) ==
|
|
member->match)
|
|
return FALSE;
|
|
|
|
member = member->next_1.node;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Checks whether a character is in a set intersection. */
|
|
Py_LOCAL_INLINE(BOOL) in_set_inter(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch) {
|
|
RE_Node* member;
|
|
|
|
member = node->nonstring.next_2.node;
|
|
|
|
while (member) {
|
|
if (matches_member(encoding, member, ch) != member->match)
|
|
return FALSE;
|
|
|
|
member = member->next_1.node;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Checks whether a character is in a set intersection, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) in_set_inter_ign(RE_EncodingTable* encoding, RE_Node*
|
|
node, int case_count, Py_UCS4* cases) {
|
|
RE_Node* member;
|
|
|
|
member = node->nonstring.next_2.node;
|
|
|
|
while (member) {
|
|
if (matches_member_ign(encoding, member, case_count, cases) !=
|
|
member->match)
|
|
return FALSE;
|
|
|
|
member = member->next_1.node;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Checks whether a character is in a set symmetric difference. */
|
|
Py_LOCAL_INLINE(BOOL) in_set_sym_diff(RE_EncodingTable* encoding, RE_Node*
|
|
node, Py_UCS4 ch) {
|
|
RE_Node* member;
|
|
BOOL result;
|
|
|
|
member = node->nonstring.next_2.node;
|
|
|
|
result = FALSE;
|
|
|
|
while (member) {
|
|
if (matches_member(encoding, member, ch) == member->match)
|
|
result = !result;
|
|
|
|
member = member->next_1.node;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Checks whether a character is in a set symmetric difference, ignoring case.
|
|
*/
|
|
Py_LOCAL_INLINE(BOOL) in_set_sym_diff_ign(RE_EncodingTable* encoding, RE_Node*
|
|
node, int case_count, Py_UCS4* cases) {
|
|
RE_Node* member;
|
|
BOOL result;
|
|
|
|
member = node->nonstring.next_2.node;
|
|
|
|
result = FALSE;
|
|
|
|
while (member) {
|
|
if (matches_member_ign(encoding, member, case_count, cases) ==
|
|
member->match)
|
|
result = !result;
|
|
|
|
member = member->next_1.node;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Checks whether a character is in a set union. */
|
|
Py_LOCAL_INLINE(BOOL) in_set_union(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch) {
|
|
RE_Node* member;
|
|
|
|
member = node->nonstring.next_2.node;
|
|
|
|
while (member) {
|
|
if (matches_member(encoding, member, ch) == member->match)
|
|
return TRUE;
|
|
|
|
member = member->next_1.node;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Checks whether a character is in a set union, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) in_set_union_ign(RE_EncodingTable* encoding, RE_Node*
|
|
node, int case_count, Py_UCS4* cases) {
|
|
RE_Node* member;
|
|
|
|
member = node->nonstring.next_2.node;
|
|
|
|
while (member) {
|
|
if (matches_member_ign(encoding, member, case_count, cases) ==
|
|
member->match)
|
|
return TRUE;
|
|
|
|
member = member->next_1.node;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Checks whether a character is in a set. */
|
|
Py_LOCAL_INLINE(BOOL) matches_SET(RE_EncodingTable* encoding, RE_Node* node,
|
|
Py_UCS4 ch) {
|
|
switch (node->op) {
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_DIFF_REV:
|
|
return in_set_diff(encoding, node, ch);
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_INTER_REV:
|
|
return in_set_inter(encoding, node, ch);
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
return in_set_sym_diff(encoding, node, ch);
|
|
case RE_OP_SET_UNION:
|
|
case RE_OP_SET_UNION_REV:
|
|
return in_set_union(encoding, node, ch);
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Checks whether a character is in a set, ignoring case. */
|
|
Py_LOCAL_INLINE(BOOL) matches_SET_IGN(RE_EncodingTable* encoding, RE_Node*
|
|
node, Py_UCS4 ch) {
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int case_count;
|
|
|
|
case_count = encoding->all_cases(ch, cases);
|
|
|
|
switch (node->op) {
|
|
case RE_OP_SET_DIFF_IGN:
|
|
case RE_OP_SET_DIFF_IGN_REV:
|
|
return in_set_diff_ign(encoding, node, case_count, cases);
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
return in_set_inter_ign(encoding, node, case_count, cases);
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
return in_set_sym_diff_ign(encoding, node, case_count, cases);
|
|
case RE_OP_SET_UNION_IGN:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
return in_set_union_ign(encoding, node, case_count, cases);
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Resets a guard list. */
|
|
Py_LOCAL_INLINE(void) reset_guard_list(RE_GuardList* guard_list) {
|
|
guard_list->count = 0;
|
|
guard_list->last_text_pos = -1;
|
|
}
|
|
|
|
/* Initialises the state for a match. */
|
|
Py_LOCAL_INLINE(void) init_match(RE_State* state) {
|
|
size_t i;
|
|
|
|
/* Reset the backtrack. */
|
|
state->current_backtrack_block = &state->backtrack_block;
|
|
state->current_backtrack_block->count = 0;
|
|
state->current_saved_groups = state->first_saved_groups;
|
|
state->backtrack = NULL;
|
|
state->search_anchor = state->text_pos;
|
|
state->match_pos = state->text_pos;
|
|
|
|
/* Reset the guards for the repeats. */
|
|
for (i = 0; i < state->pattern->repeat_count; i++) {
|
|
reset_guard_list(&state->repeats[i].body_guard_list);
|
|
reset_guard_list(&state->repeats[i].tail_guard_list);
|
|
}
|
|
|
|
/* Reset the guards for the fuzzy sections. */
|
|
for (i = 0; i < state->pattern->fuzzy_count; i++) {
|
|
reset_guard_list(&state->fuzzy_guards[i].body_guard_list);
|
|
reset_guard_list(&state->fuzzy_guards[i].tail_guard_list);
|
|
}
|
|
|
|
for (i = 0; i < state->pattern->true_group_count; i++) {
|
|
RE_GroupData* group;
|
|
|
|
group = &state->groups[i];
|
|
group->span.start = -1;
|
|
group->span.end = -1;
|
|
group->capture_count = 0;
|
|
group->current_capture = -1;
|
|
}
|
|
|
|
/* Reset the guards for the group calls. */
|
|
for (i = 0; i < state->pattern->call_ref_info_count; i++)
|
|
reset_guard_list(&state->group_call_guard_list[i]);
|
|
|
|
/* Clear the counts and cost for matching. */
|
|
memset(state->fuzzy_info.counts, 0, sizeof(state->fuzzy_info.counts));
|
|
state->fuzzy_info.total_cost = 0;
|
|
memset(state->total_fuzzy_counts, 0, sizeof(state->total_fuzzy_counts));
|
|
state->total_errors = 0;
|
|
state->total_cost = 0;
|
|
state->too_few_errors = FALSE;
|
|
state->capture_change = 0;
|
|
state->iterations = 0;
|
|
}
|
|
|
|
/* Adds a new backtrack entry. */
|
|
Py_LOCAL_INLINE(BOOL) add_backtrack(RE_SafeState* safe_state, RE_UINT8 op) {
|
|
RE_State* state;
|
|
RE_BacktrackBlock* current;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
current = state->current_backtrack_block;
|
|
if (current->count >= current->capacity) {
|
|
if (!current->next) {
|
|
RE_BacktrackBlock* next;
|
|
|
|
/* Is there too much backtracking? */
|
|
if (state->backtrack_allocated >= RE_MAX_BACKTRACK_ALLOC)
|
|
return FALSE;
|
|
|
|
next = (RE_BacktrackBlock*)safe_alloc(safe_state,
|
|
sizeof(RE_BacktrackBlock));
|
|
if (!next)
|
|
return FALSE;
|
|
|
|
next->previous = current;
|
|
next->next = NULL;
|
|
next->capacity = RE_BACKTRACK_BLOCK_SIZE;
|
|
current->next = next;
|
|
|
|
state->backtrack_allocated += RE_BACKTRACK_BLOCK_SIZE;
|
|
}
|
|
|
|
current = current->next;
|
|
current->count = 0;
|
|
state->current_backtrack_block = current;
|
|
}
|
|
|
|
state->backtrack = ¤t->items[current->count++];
|
|
state->backtrack->op = op;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Gets the last backtrack entry.
|
|
*
|
|
* It'll never be called when there are _no_ entries.
|
|
*/
|
|
Py_LOCAL_INLINE(RE_BacktrackData*) last_backtrack(RE_State* state) {
|
|
RE_BacktrackBlock* current;
|
|
|
|
current = state->current_backtrack_block;
|
|
state->backtrack = ¤t->items[current->count - 1];
|
|
|
|
return state->backtrack;
|
|
}
|
|
|
|
/* Discards the last backtrack entry.
|
|
*
|
|
* It'll never be called to discard the _only_ entry.
|
|
*/
|
|
Py_LOCAL_INLINE(void) discard_backtrack(RE_State* state) {
|
|
RE_BacktrackBlock* current;
|
|
|
|
current = state->current_backtrack_block;
|
|
--current->count;
|
|
if (current->count == 0 && current->previous)
|
|
state->current_backtrack_block = current->previous;
|
|
}
|
|
|
|
/* Copies a repeat guard list. */
|
|
Py_LOCAL_INLINE(BOOL) copy_guard_data(RE_SafeState* safe_state, RE_GuardList*
|
|
dst, RE_GuardList* src) {
|
|
if (dst->capacity < src->count) {
|
|
RE_GuardSpan* new_spans;
|
|
|
|
if (!safe_state)
|
|
return FALSE;
|
|
|
|
dst->capacity = src->count;
|
|
new_spans = (RE_GuardSpan*)safe_realloc(safe_state, dst->spans,
|
|
dst->capacity * sizeof(RE_GuardSpan));
|
|
if (!new_spans)
|
|
return FALSE;
|
|
|
|
dst->spans = new_spans;
|
|
}
|
|
|
|
dst->count = src->count;
|
|
memmove(dst->spans, src->spans, dst->count * sizeof(RE_GuardSpan));
|
|
|
|
dst->last_text_pos = -1;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Copies a repeat. */
|
|
Py_LOCAL_INLINE(BOOL) copy_repeat_data(RE_SafeState* safe_state, RE_RepeatData*
|
|
dst, RE_RepeatData* src) {
|
|
if (!copy_guard_data(safe_state, &dst->body_guard_list,
|
|
&src->body_guard_list) || !copy_guard_data(safe_state,
|
|
&dst->tail_guard_list, &src->tail_guard_list)) {
|
|
safe_dealloc(safe_state, dst->body_guard_list.spans);
|
|
safe_dealloc(safe_state, dst->tail_guard_list.spans);
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
dst->count = src->count;
|
|
dst->start = src->start;
|
|
dst->capture_change = src->capture_change;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Pushes a return node onto the group call stack. */
|
|
Py_LOCAL_INLINE(BOOL) push_group_return(RE_SafeState* safe_state, RE_Node*
|
|
return_node) {
|
|
RE_State* state;
|
|
PatternObject* pattern;
|
|
RE_GroupCallFrame* frame;
|
|
|
|
state = safe_state->re_state;
|
|
pattern = state->pattern;
|
|
|
|
if (state->current_group_call_frame &&
|
|
state->current_group_call_frame->next)
|
|
/* Advance to the next allocated frame. */
|
|
frame = state->current_group_call_frame->next;
|
|
else if (!state->current_group_call_frame && state->first_group_call_frame)
|
|
/* Advance to the first allocated frame. */
|
|
frame = state->first_group_call_frame;
|
|
else {
|
|
/* Create a new frame. */
|
|
frame = (RE_GroupCallFrame*)safe_alloc(safe_state,
|
|
sizeof(RE_GroupCallFrame));
|
|
if (!frame)
|
|
return FALSE;
|
|
|
|
frame->groups = (RE_GroupData*)safe_alloc(safe_state,
|
|
pattern->true_group_count * sizeof(RE_GroupData));
|
|
frame->repeats = (RE_RepeatData*)safe_alloc(safe_state,
|
|
pattern->repeat_count * sizeof(RE_RepeatData));
|
|
if (!frame->groups || !frame->repeats) {
|
|
safe_dealloc(safe_state, frame->groups);
|
|
safe_dealloc(safe_state, frame->repeats);
|
|
safe_dealloc(safe_state, frame);
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
memset(frame->groups, 0, pattern->true_group_count *
|
|
sizeof(RE_GroupData));
|
|
memset(frame->repeats, 0, pattern->repeat_count *
|
|
sizeof(RE_RepeatData));
|
|
|
|
frame->previous = state->current_group_call_frame;
|
|
frame->next = NULL;
|
|
|
|
if (frame->previous)
|
|
frame->previous->next = frame;
|
|
else
|
|
state->first_group_call_frame = frame;
|
|
}
|
|
|
|
frame->node = return_node;
|
|
|
|
/* Push the groups and guards. */
|
|
if (return_node) {
|
|
size_t g;
|
|
size_t r;
|
|
|
|
for (g = 0; g < pattern->true_group_count; g++) {
|
|
frame->groups[g].span = state->groups[g].span;
|
|
frame->groups[g].current_capture =
|
|
state->groups[g].current_capture;
|
|
}
|
|
|
|
for (r = 0; r < pattern->repeat_count; r++) {
|
|
if (!copy_repeat_data(safe_state, &frame->repeats[r],
|
|
&state->repeats[r]))
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
state->current_group_call_frame = frame;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Pops a return node from the group call stack. */
|
|
Py_LOCAL_INLINE(RE_Node*) pop_group_return(RE_State* state) {
|
|
RE_GroupCallFrame* frame;
|
|
|
|
frame = state->current_group_call_frame;
|
|
|
|
/* Pop the groups and repeats. */
|
|
if (frame->node) {
|
|
PatternObject* pattern;
|
|
size_t g;
|
|
size_t r;
|
|
|
|
pattern = state->pattern;
|
|
|
|
for (g = 0; g < pattern->true_group_count; g++) {
|
|
state->groups[g].span = frame->groups[g].span;
|
|
state->groups[g].current_capture =
|
|
frame->groups[g].current_capture;
|
|
}
|
|
|
|
for (r = 0; r < pattern->repeat_count; r++)
|
|
copy_repeat_data(NULL, &state->repeats[r], &frame->repeats[r]);
|
|
}
|
|
|
|
/* Withdraw to previous frame. */
|
|
state->current_group_call_frame = frame->previous;
|
|
|
|
return frame->node;
|
|
}
|
|
|
|
/* Returns the return node from the top of the group call stack. */
|
|
Py_LOCAL_INLINE(RE_Node*) top_group_return(RE_State* state) {
|
|
RE_GroupCallFrame* frame;
|
|
|
|
frame = state->current_group_call_frame;
|
|
|
|
return frame->node;
|
|
}
|
|
|
|
/* Checks whether a node matches only 1 character. */
|
|
Py_LOCAL_INLINE(BOOL) node_matches_one_character(RE_Node* node) {
|
|
switch (node->op) {
|
|
case RE_OP_ANY:
|
|
case RE_OP_ANY_ALL:
|
|
case RE_OP_ANY_ALL_REV:
|
|
case RE_OP_ANY_REV:
|
|
case RE_OP_ANY_U:
|
|
case RE_OP_ANY_U_REV:
|
|
case RE_OP_CHARACTER:
|
|
case RE_OP_CHARACTER_IGN:
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
case RE_OP_CHARACTER_REV:
|
|
case RE_OP_PROPERTY:
|
|
case RE_OP_PROPERTY_IGN:
|
|
case RE_OP_PROPERTY_IGN_REV:
|
|
case RE_OP_PROPERTY_REV:
|
|
case RE_OP_RANGE:
|
|
case RE_OP_RANGE_IGN:
|
|
case RE_OP_RANGE_IGN_REV:
|
|
case RE_OP_RANGE_REV:
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_DIFF_IGN:
|
|
case RE_OP_SET_DIFF_IGN_REV:
|
|
case RE_OP_SET_DIFF_REV:
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
case RE_OP_SET_INTER_REV:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
case RE_OP_SET_UNION:
|
|
case RE_OP_SET_UNION_IGN:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
case RE_OP_SET_UNION_REV:
|
|
return TRUE;
|
|
default:
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
/* Checks whether the node is a firstset. */
|
|
Py_LOCAL_INLINE(BOOL) is_firstset(RE_Node* node) {
|
|
if (node->step != 0)
|
|
return FALSE;
|
|
|
|
return node_matches_one_character(node);
|
|
}
|
|
|
|
/* Locates the start node for testing ahead. */
|
|
Py_LOCAL_INLINE(RE_Node*) locate_test_start(RE_Node* node) {
|
|
for (;;) {
|
|
switch (node->op) {
|
|
case RE_OP_BOUNDARY:
|
|
switch (node->next_1.node->op) {
|
|
case RE_OP_STRING:
|
|
case RE_OP_STRING_FLD:
|
|
case RE_OP_STRING_FLD_REV:
|
|
case RE_OP_STRING_IGN:
|
|
case RE_OP_STRING_IGN_REV:
|
|
case RE_OP_STRING_REV:
|
|
return node->next_1.node;
|
|
default:
|
|
return node;
|
|
}
|
|
case RE_OP_CALL_REF:
|
|
case RE_OP_END_GROUP:
|
|
case RE_OP_START_GROUP:
|
|
node = node->next_1.node;
|
|
break;
|
|
case RE_OP_GREEDY_REPEAT:
|
|
case RE_OP_LAZY_REPEAT:
|
|
if (node->values[1] == 0)
|
|
return node;
|
|
node = node->next_1.node;
|
|
break;
|
|
case RE_OP_GREEDY_REPEAT_ONE:
|
|
case RE_OP_LAZY_REPEAT_ONE:
|
|
if (node->values[1] == 0)
|
|
return node;
|
|
return node->nonstring.next_2.node;
|
|
case RE_OP_LOOKAROUND:
|
|
node = node->next_1.node;
|
|
break;
|
|
default:
|
|
if (is_firstset(node)) {
|
|
switch (node->next_1.node->op) {
|
|
case RE_OP_END_OF_STRING:
|
|
case RE_OP_START_OF_STRING:
|
|
return node->next_1.node;
|
|
}
|
|
}
|
|
|
|
return node;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Checks whether a character matches any of a set of case characters. */
|
|
Py_LOCAL_INLINE(BOOL) any_case(Py_UCS4 ch, int case_count, Py_UCS4* cases) {
|
|
int i;
|
|
|
|
for (i = 0; i < case_count; i++) {
|
|
if (ch == cases[i])
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Matches many ANYs, up to a limit. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_ANY(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_ANY(encoding, node, text_ptr[0])
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_ANY(encoding, node, text_ptr[0])
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_ANY(encoding, node, text_ptr[0])
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many ANYs, up to a limit, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_ANY_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_ANY(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_ANY(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_ANY(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many ANY_Us, up to a limit. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_ANY_U(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_ANY_U(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_ANY_U(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_ANY_U(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many ANY_Us, up to a limit, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_ANY_U_REV(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_ANY_U(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_ANY_U(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_ANY_U(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many CHARACTERs, up to a limit. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_CHARACTER(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
Py_UCS4 ch;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
ch = node->values[0];
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && (text_ptr[0] == ch) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && (text_ptr[0] == ch) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && (text_ptr[0] == ch) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many CHARACTERs, up to a limit, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_CHARACTER_IGN(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int case_count;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
case_count = state->encoding->all_cases(node->values[0], cases);
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && any_case(text_ptr[0], case_count, cases)
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && any_case(text_ptr[0], case_count, cases)
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && any_case(text_ptr[0], case_count, cases)
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many CHARACTERs, up to a limit, backwards, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_CHARACTER_IGN_REV(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int case_count;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
case_count = state->encoding->all_cases(node->values[0], cases);
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && any_case(text_ptr[-1], case_count,
|
|
cases) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && any_case(text_ptr[-1], case_count,
|
|
cases) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && any_case(text_ptr[-1], case_count,
|
|
cases) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many CHARACTERs, up to a limit, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_CHARACTER_REV(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
Py_UCS4 ch;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
ch = node->values[0];
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && (text_ptr[-1] == ch) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && (text_ptr[-1] == ch) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && (text_ptr[-1] == ch) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many PROPERTYs, up to a limit. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_PROPERTY(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_PROPERTY(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_PROPERTY(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many PROPERTYs, up to a limit, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_PROPERTY_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_PROPERTY_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_PROPERTY_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many PROPERTYs, up to a limit, backwards, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN_REV(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_PROPERTY_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_PROPERTY_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_PROPERTY_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many PROPERTYs, up to a limit, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_REV(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_PROPERTY(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_PROPERTY(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_PROPERTY(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many RANGEs, up to a limit. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_RANGE(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_RANGE(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_RANGE(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_RANGE(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many RANGEs, up to a limit, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_RANGE_IGN(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_RANGE_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_RANGE_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_RANGE_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many RANGEs, up to a limit, backwards, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_RANGE_IGN_REV(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_RANGE_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_RANGE_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_RANGE_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many RANGEs, up to a limit, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_RANGE_REV(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_RANGE(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_RANGE(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_RANGE(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many SETs, up to a limit. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_SET(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_SET(encoding, node, text_ptr[0])
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_SET(encoding, node, text_ptr[0])
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_SET(encoding, node, text_ptr[0])
|
|
== match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many SETs, up to a limit, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_SET_IGN(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_SET_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_SET_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr < limit_ptr && matches_SET_IGN(encoding, node,
|
|
text_ptr[0]) == match)
|
|
++text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many SETs, up to a limit, backwards, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_SET_IGN_REV(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_SET_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_SET_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_SET_IGN(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Matches many SETs, up to a limit, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_many_SET_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit, BOOL match) {
|
|
void* text;
|
|
RE_EncodingTable* encoding;
|
|
|
|
text = state->text;
|
|
match = node->match == match;
|
|
encoding = state->encoding;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_SET(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS1*)text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_SET(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS2*)text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr > limit_ptr && matches_SET(encoding, node,
|
|
text_ptr[-1]) == match)
|
|
--text_ptr;
|
|
|
|
text_pos = text_ptr - (Py_UCS4*)text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Counts a repeated character pattern. */
|
|
Py_LOCAL_INLINE(size_t) count_one(RE_State* state, RE_Node* node, Py_ssize_t
|
|
text_pos, size_t max_count, BOOL* is_partial) {
|
|
size_t count;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
if (max_count < 1)
|
|
return 0;
|
|
|
|
switch (node->op) {
|
|
case RE_OP_ANY:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_ANY(state, node, text_pos, text_pos +
|
|
(Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_ANY_ALL:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_ANY_ALL_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_ANY_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_ANY_REV(state, node, text_pos,
|
|
text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_ANY_U:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_ANY_U(state, node, text_pos, text_pos +
|
|
(Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_ANY_U_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_ANY_U_REV(state, node, text_pos,
|
|
text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_CHARACTER:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_CHARACTER(state, node, text_pos, text_pos +
|
|
(Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_CHARACTER_IGN:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_CHARACTER_IGN(state, node, text_pos,
|
|
text_pos + (Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_CHARACTER_IGN_REV(state, node,
|
|
text_pos, text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_CHARACTER_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_CHARACTER_REV(state, node,
|
|
text_pos, text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_PROPERTY:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_PROPERTY(state, node, text_pos, text_pos +
|
|
(Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_PROPERTY_IGN:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_PROPERTY_IGN(state, node, text_pos,
|
|
text_pos + (Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_PROPERTY_IGN_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_PROPERTY_IGN_REV(state, node,
|
|
text_pos, text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_PROPERTY_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_PROPERTY_REV(state, node,
|
|
text_pos, text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_RANGE:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_RANGE(state, node, text_pos, text_pos +
|
|
(Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_RANGE_IGN:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_RANGE_IGN(state, node, text_pos, text_pos +
|
|
(Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_RANGE_IGN_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_RANGE_IGN_REV(state, node,
|
|
text_pos, text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_RANGE_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_RANGE_REV(state, node, text_pos,
|
|
text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_UNION:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_SET(state, node, text_pos, text_pos +
|
|
(Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_SET_DIFF_IGN:
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_UNION_IGN:
|
|
count = min_size_t((size_t)(state->slice_end - text_pos), max_count);
|
|
|
|
count = (size_t)(match_many_SET_IGN(state, node, text_pos, text_pos +
|
|
(Py_ssize_t)count, TRUE) - text_pos);
|
|
|
|
*is_partial = count == (size_t)(state->text_length - text_pos) && count
|
|
< max_count && state->partial_side == RE_PARTIAL_RIGHT;
|
|
|
|
return count;
|
|
case RE_OP_SET_DIFF_IGN_REV:
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_SET_IGN_REV(state, node,
|
|
text_pos, text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
case RE_OP_SET_DIFF_REV:
|
|
case RE_OP_SET_INTER_REV:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
case RE_OP_SET_UNION_REV:
|
|
count = min_size_t((size_t)(text_pos - state->slice_start), max_count);
|
|
|
|
count = (size_t)(text_pos - match_many_SET_REV(state, node, text_pos,
|
|
text_pos - (Py_ssize_t)count, TRUE));
|
|
|
|
*is_partial = count == (size_t)(text_pos) && count < max_count &&
|
|
state->partial_side == RE_PARTIAL_LEFT;
|
|
|
|
return count;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* Performs a simple string search. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) simple_string_search(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
RE_EncodingTable* encoding;
|
|
Py_UCS4 check_char;
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
values = node->values;
|
|
encoding = state->encoding;
|
|
check_char = values[0];
|
|
|
|
*is_partial = FALSE;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text = (Py_UCS1*)state->text;
|
|
Py_UCS1* text_ptr = text + text_pos;
|
|
Py_UCS1* limit_ptr = text + limit;
|
|
|
|
while (text_ptr < limit_ptr) {
|
|
if (text_ptr[0] == check_char) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr + s_pos >= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char(encoding, text_ptr[s_pos], values[s_pos]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
++text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text = (Py_UCS2*)state->text;
|
|
Py_UCS2* text_ptr = text + text_pos;
|
|
Py_UCS2* limit_ptr = text + limit;
|
|
|
|
while (text_ptr < limit_ptr) {
|
|
if (text_ptr[0] == check_char) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr + s_pos >= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char(encoding, text_ptr[s_pos], values[s_pos]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
++text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text = (Py_UCS4*)state->text;
|
|
Py_UCS4* text_ptr = text + text_pos;
|
|
Py_UCS4* limit_ptr = text + limit;
|
|
|
|
while (text_ptr < limit_ptr) {
|
|
if (text_ptr[0] == check_char) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr + s_pos >= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char(encoding, text_ptr[s_pos], values[s_pos]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
++text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_pos;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Performs a simple string search, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) simple_string_search_ign(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
RE_EncodingTable* encoding;
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int case_count;
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
values = node->values;
|
|
encoding = state->encoding;
|
|
case_count = encoding->all_cases(values[0], cases);
|
|
|
|
*is_partial = FALSE;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text = (Py_UCS1*)state->text;
|
|
Py_UCS1* text_ptr = text + text_pos;
|
|
Py_UCS1* limit_ptr = text + limit;
|
|
|
|
while (text_ptr < limit_ptr) {
|
|
if (any_case(text_ptr[0], case_count, cases)) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr + s_pos >= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char_ign(encoding, text_ptr[s_pos],
|
|
values[s_pos]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
++text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text = (Py_UCS2*)state->text;
|
|
Py_UCS2* text_ptr = text + text_pos;
|
|
Py_UCS2* limit_ptr = text + limit;
|
|
|
|
while (text_ptr < limit_ptr) {
|
|
if (any_case(text_ptr[0], case_count, cases)) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr + s_pos >= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char_ign(encoding, text_ptr[s_pos],
|
|
values[s_pos]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
++text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text = (Py_UCS4*)state->text;
|
|
Py_UCS4* text_ptr = text + text_pos;
|
|
Py_UCS4* limit_ptr = text + limit;
|
|
|
|
while (text_ptr < limit_ptr) {
|
|
if (any_case(text_ptr[0], case_count, cases)) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr + s_pos >= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char_ign(encoding, text_ptr[s_pos],
|
|
values[s_pos]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
++text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_pos;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Performs a simple string search, backwards, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) simple_string_search_ign_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
RE_EncodingTable* encoding;
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int case_count;
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
values = node->values;
|
|
encoding = state->encoding;
|
|
case_count = encoding->all_cases(values[length - 1], cases);
|
|
|
|
*is_partial = FALSE;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text = (Py_UCS1*)state->text;
|
|
Py_UCS1* text_ptr = text + text_pos;
|
|
Py_UCS1* limit_ptr = text + limit;
|
|
|
|
while (text_ptr > limit_ptr) {
|
|
if (any_case(text_ptr[-1], case_count, cases)) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr - s_pos <= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char_ign(encoding, text_ptr[- s_pos - 1],
|
|
values[length - s_pos - 1]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
--text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text = (Py_UCS2*)state->text;
|
|
Py_UCS2* text_ptr = text + text_pos;
|
|
Py_UCS2* limit_ptr = text + limit;
|
|
|
|
while (text_ptr > limit_ptr) {
|
|
if (any_case(text_ptr[-1], case_count, cases)) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr - s_pos <= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char_ign(encoding, text_ptr[- s_pos - 1],
|
|
values[length - s_pos - 1]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
--text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text = (Py_UCS4*)state->text;
|
|
Py_UCS4* text_ptr = text + text_pos;
|
|
Py_UCS4* limit_ptr = text + limit;
|
|
|
|
while (text_ptr > limit_ptr) {
|
|
if (any_case(text_ptr[-1], case_count, cases)) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr - s_pos <= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char_ign(encoding, text_ptr[- s_pos - 1],
|
|
values[length - s_pos - 1]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
--text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_pos;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Performs a simple string search, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) simple_string_search_rev(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
RE_EncodingTable* encoding;
|
|
Py_UCS4 check_char;
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
values = node->values;
|
|
encoding = state->encoding;
|
|
check_char = values[length - 1];
|
|
|
|
*is_partial = FALSE;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text = (Py_UCS1*)state->text;
|
|
Py_UCS1* text_ptr = text + text_pos;
|
|
Py_UCS1* limit_ptr = text + limit;
|
|
|
|
while (text_ptr > limit_ptr) {
|
|
if (text_ptr[-1] == check_char) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr - s_pos <= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char(encoding, text_ptr[- s_pos - 1],
|
|
values[length - s_pos - 1]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
--text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text = (Py_UCS2*)state->text;
|
|
Py_UCS2* text_ptr = text + text_pos;
|
|
Py_UCS2* limit_ptr = text + limit;
|
|
|
|
while (text_ptr > limit_ptr) {
|
|
if (text_ptr[-1] == check_char) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr - s_pos <= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char(encoding, text_ptr[- s_pos - 1],
|
|
values[length - s_pos - 1]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
--text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text = (Py_UCS4*)state->text;
|
|
Py_UCS4* text_ptr = text + text_pos;
|
|
Py_UCS4* limit_ptr = text + limit;
|
|
|
|
while (text_ptr > limit_ptr) {
|
|
if (text_ptr[-1] == check_char) {
|
|
Py_ssize_t s_pos;
|
|
|
|
s_pos = 1;
|
|
|
|
for (;;) {
|
|
if (s_pos >= length)
|
|
/* End of search string. */
|
|
return text_ptr - text;
|
|
|
|
if (text_ptr - s_pos <= limit_ptr) {
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_ptr - text;
|
|
}
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
if (!same_char(encoding, text_ptr[- s_pos - 1],
|
|
values[length - s_pos - 1]))
|
|
break;
|
|
|
|
++s_pos;
|
|
}
|
|
}
|
|
|
|
--text_ptr;
|
|
}
|
|
text_pos = text_ptr - text;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Off the end of the text. */
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
/* Partial match. */
|
|
*is_partial = TRUE;
|
|
return text_pos;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Performs a Boyer-Moore fast string search. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) fast_string_search(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos, Py_ssize_t limit) {
|
|
RE_EncodingTable* encoding;
|
|
void* text;
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
Py_ssize_t* bad_character_offset;
|
|
Py_ssize_t* good_suffix_offset;
|
|
Py_ssize_t last_pos;
|
|
Py_UCS4 check_char;
|
|
|
|
encoding = state->encoding;
|
|
text = state->text;
|
|
length = (Py_ssize_t)node->value_count;
|
|
values = node->values;
|
|
good_suffix_offset = node->string.good_suffix_offset;
|
|
bad_character_offset = node->string.bad_character_offset;
|
|
last_pos = length - 1;
|
|
check_char = values[last_pos];
|
|
limit -= length;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr <= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[last_pos];
|
|
if (ch == check_char) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = last_pos - 1;
|
|
while (pos >= 0 && same_char(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
--pos;
|
|
|
|
if (pos < 0)
|
|
return text_ptr - (Py_UCS1*)text;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr <= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[last_pos];
|
|
if (ch == check_char) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = last_pos - 1;
|
|
while (pos >= 0 && same_char(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
--pos;
|
|
|
|
if (pos < 0)
|
|
return text_ptr - (Py_UCS2*)text;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr <= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[last_pos];
|
|
if (ch == check_char) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = last_pos - 1;
|
|
while (pos >= 0 && same_char(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
--pos;
|
|
|
|
if (pos < 0)
|
|
return text_ptr - (Py_UCS4*)text;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Performs a Boyer-Moore fast string search, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) fast_string_search_ign(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit) {
|
|
RE_EncodingTable* encoding;
|
|
void* text;
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
Py_ssize_t* bad_character_offset;
|
|
Py_ssize_t* good_suffix_offset;
|
|
Py_ssize_t last_pos;
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int case_count;
|
|
|
|
encoding = state->encoding;
|
|
text = state->text;
|
|
length = (Py_ssize_t)node->value_count;
|
|
values = node->values;
|
|
good_suffix_offset = node->string.good_suffix_offset;
|
|
bad_character_offset = node->string.bad_character_offset;
|
|
last_pos = length - 1;
|
|
case_count = encoding->all_cases(values[last_pos], cases);
|
|
limit -= length;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr <= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[last_pos];
|
|
if (any_case(ch, case_count, cases)) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = last_pos - 1;
|
|
while (pos >= 0 && same_char_ign(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
--pos;
|
|
|
|
if (pos < 0)
|
|
return text_ptr - (Py_UCS1*)text;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr <= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[last_pos];
|
|
if (any_case(ch, case_count, cases)) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = last_pos - 1;
|
|
while (pos >= 0 && same_char_ign(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
--pos;
|
|
|
|
if (pos < 0)
|
|
return text_ptr - (Py_UCS2*)text;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr <= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[last_pos];
|
|
if (any_case(ch, case_count, cases)) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = last_pos - 1;
|
|
while (pos >= 0 && same_char_ign(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
--pos;
|
|
|
|
if (pos < 0)
|
|
return text_ptr - (Py_UCS4*)text;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Performs a Boyer-Moore fast string search, backwards, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) fast_string_search_ign_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit) {
|
|
RE_EncodingTable* encoding;
|
|
void* text;
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
Py_ssize_t* bad_character_offset;
|
|
Py_ssize_t* good_suffix_offset;
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
int case_count;
|
|
|
|
encoding = state->encoding;
|
|
text = state->text;
|
|
length = (Py_ssize_t)node->value_count;
|
|
values = node->values;
|
|
good_suffix_offset = node->string.good_suffix_offset;
|
|
bad_character_offset = node->string.bad_character_offset;
|
|
case_count = encoding->all_cases(values[0], cases);
|
|
text_pos -= length;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr >= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[0];
|
|
if (any_case(ch, case_count, cases)) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = 1;
|
|
while (pos < length && same_char_ign(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
++pos;
|
|
|
|
if (pos >= length)
|
|
return text_ptr - (Py_UCS1*)text + length;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr >= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[0];
|
|
if (any_case(ch, case_count, cases)) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = 1;
|
|
while (pos < length && same_char_ign(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
++pos;
|
|
|
|
if (pos >= length)
|
|
return text_ptr - (Py_UCS2*)text + length;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr >= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[0];
|
|
if (any_case(ch, case_count, cases)) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = 1;
|
|
while (pos < length && same_char_ign(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
++pos;
|
|
|
|
if (pos >= length)
|
|
return text_ptr - (Py_UCS4*)text + length;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Performs a Boyer-Moore fast string search, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) fast_string_search_rev(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit) {
|
|
RE_EncodingTable* encoding;
|
|
void* text;
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
Py_ssize_t* bad_character_offset;
|
|
Py_ssize_t* good_suffix_offset;
|
|
Py_UCS4 check_char;
|
|
|
|
encoding = state->encoding;
|
|
text = state->text;
|
|
length = (Py_ssize_t)node->value_count;
|
|
values = node->values;
|
|
good_suffix_offset = node->string.good_suffix_offset;
|
|
bad_character_offset = node->string.bad_character_offset;
|
|
check_char = values[0];
|
|
text_pos -= length;
|
|
|
|
switch (state->charsize) {
|
|
case 1:
|
|
{
|
|
Py_UCS1* text_ptr;
|
|
Py_UCS1* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS1*)text + text_pos;
|
|
limit_ptr = (Py_UCS1*)text + limit;
|
|
|
|
while (text_ptr >= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[0];
|
|
if (ch == check_char) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = 1;
|
|
while (pos < length && same_char(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
++pos;
|
|
|
|
if (pos >= length)
|
|
return text_ptr - (Py_UCS1*)text + length;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
case 2:
|
|
{
|
|
Py_UCS2* text_ptr;
|
|
Py_UCS2* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS2*)text + text_pos;
|
|
limit_ptr = (Py_UCS2*)text + limit;
|
|
|
|
while (text_ptr >= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[0];
|
|
if (ch == check_char) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = 1;
|
|
while (pos < length && same_char(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
++pos;
|
|
|
|
if (pos >= length)
|
|
return text_ptr - (Py_UCS2*)text + length;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
case 4:
|
|
{
|
|
Py_UCS4* text_ptr;
|
|
Py_UCS4* limit_ptr;
|
|
|
|
text_ptr = (Py_UCS4*)text + text_pos;
|
|
limit_ptr = (Py_UCS4*)text + limit;
|
|
|
|
while (text_ptr >= limit_ptr) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = text_ptr[0];
|
|
if (ch == check_char) {
|
|
Py_ssize_t pos;
|
|
|
|
pos = 1;
|
|
while (pos < length && same_char(encoding,
|
|
text_ptr[pos], values[pos]))
|
|
++pos;
|
|
|
|
if (pos >= length)
|
|
return text_ptr - (Py_UCS4*)text + length;
|
|
|
|
text_ptr += good_suffix_offset[pos];
|
|
} else
|
|
text_ptr += bad_character_offset[ch & 0xFF];
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Builds the tables for a Boyer-Moore fast string search. */
|
|
Py_LOCAL_INLINE(BOOL) build_fast_tables(RE_EncodingTable* encoding, RE_Node*
|
|
node, BOOL ignore) {
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
Py_ssize_t* bad;
|
|
Py_ssize_t* good;
|
|
Py_UCS4 ch;
|
|
Py_ssize_t last_pos;
|
|
Py_ssize_t pos;
|
|
BOOL (*is_same_char)(RE_EncodingTable* encoding, Py_UCS4 ch1, Py_UCS4 ch2);
|
|
Py_ssize_t suffix_len;
|
|
BOOL saved_start;
|
|
Py_ssize_t s;
|
|
Py_ssize_t i;
|
|
Py_ssize_t s_start;
|
|
Py_UCS4 codepoints[RE_MAX_CASES];
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
|
|
if (length < RE_MIN_FAST_LENGTH)
|
|
return TRUE;
|
|
|
|
values = node->values;
|
|
bad = (Py_ssize_t*)re_alloc(256 * sizeof(bad[0]));
|
|
good = (Py_ssize_t*)re_alloc((size_t)length * sizeof(good[0]));
|
|
|
|
if (!bad || !good) {
|
|
re_dealloc(bad);
|
|
re_dealloc(good);
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
for (ch = 0; ch < 0x100; ch++)
|
|
bad[ch] = length;
|
|
|
|
last_pos = length - 1;
|
|
|
|
for (pos = 0; pos < last_pos; pos++) {
|
|
Py_ssize_t offset;
|
|
|
|
offset = last_pos - pos;
|
|
ch = values[pos];
|
|
if (ignore) {
|
|
int count;
|
|
int i;
|
|
|
|
count = encoding->all_cases(ch, codepoints);
|
|
|
|
for (i = 0; i < count; i++)
|
|
bad[codepoints[i] & 0xFF] = offset;
|
|
} else
|
|
bad[ch & 0xFF] = offset;
|
|
}
|
|
|
|
is_same_char = ignore ? same_char_ign_wrapper : same_char_wrapper;
|
|
|
|
suffix_len = 2;
|
|
pos = length - suffix_len;
|
|
saved_start = FALSE;
|
|
s = pos - 1;
|
|
i = suffix_len - 1;
|
|
s_start = s;
|
|
|
|
while (pos >= 0) {
|
|
/* Look for another occurrence of the suffix. */
|
|
while (i > 0) {
|
|
/* Have we dropped off the end of the string? */
|
|
if (s + i < 0)
|
|
break;
|
|
|
|
if (is_same_char(encoding, values[s + i], values[pos + i]))
|
|
/* It still matches. */
|
|
--i;
|
|
else {
|
|
/* Start again further along. */
|
|
--s;
|
|
i = suffix_len - 1;
|
|
}
|
|
}
|
|
|
|
if (s >= 0 && is_same_char(encoding, values[s], values[pos])) {
|
|
/* We haven't dropped off the end of the string, and the suffix has
|
|
* matched this far, so this is a good starting point for the next
|
|
* iteration.
|
|
*/
|
|
--s;
|
|
if (!saved_start) {
|
|
s_start = s;
|
|
saved_start = TRUE;
|
|
}
|
|
} else {
|
|
/* Calculate the suffix offset. */
|
|
good[pos] = pos - s;
|
|
|
|
/* Extend the suffix and start searching for _this_ one. */
|
|
--pos;
|
|
++suffix_len;
|
|
|
|
/* Where's a good place to start searching? */
|
|
if (saved_start) {
|
|
s = s_start;
|
|
saved_start = FALSE;
|
|
} else
|
|
--s;
|
|
|
|
/* Can we short-circuit the searching? */
|
|
if (s < 0)
|
|
break;
|
|
}
|
|
|
|
i = suffix_len - 1;
|
|
}
|
|
|
|
/* Fill-in any remaining entries. */
|
|
while (pos >= 0) {
|
|
good[pos] = pos - s;
|
|
--pos;
|
|
--s;
|
|
}
|
|
|
|
node->string.bad_character_offset = bad;
|
|
node->string.good_suffix_offset = good;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Builds the tables for a Boyer-Moore fast string search, backwards. */
|
|
Py_LOCAL_INLINE(BOOL) build_fast_tables_rev(RE_EncodingTable* encoding,
|
|
RE_Node* node, BOOL ignore) {
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
Py_ssize_t* bad;
|
|
Py_ssize_t* good;
|
|
Py_UCS4 ch;
|
|
Py_ssize_t last_pos;
|
|
Py_ssize_t pos;
|
|
BOOL (*is_same_char)(RE_EncodingTable* encoding, Py_UCS4 ch1, Py_UCS4 ch2);
|
|
Py_ssize_t suffix_len;
|
|
BOOL saved_start;
|
|
Py_ssize_t s;
|
|
Py_ssize_t i;
|
|
Py_ssize_t s_start;
|
|
Py_UCS4 codepoints[RE_MAX_CASES];
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
|
|
if (length < RE_MIN_FAST_LENGTH)
|
|
return TRUE;
|
|
|
|
values = node->values;
|
|
bad = (Py_ssize_t*)re_alloc(256 * sizeof(bad[0]));
|
|
good = (Py_ssize_t*)re_alloc((size_t)length * sizeof(good[0]));
|
|
|
|
if (!bad || !good) {
|
|
re_dealloc(bad);
|
|
re_dealloc(good);
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
for (ch = 0; ch < 0x100; ch++)
|
|
bad[ch] = -length;
|
|
|
|
last_pos = length - 1;
|
|
|
|
for (pos = last_pos; pos > 0; pos--) {
|
|
Py_ssize_t offset;
|
|
|
|
offset = -pos;
|
|
ch = values[pos];
|
|
if (ignore) {
|
|
int count;
|
|
int i;
|
|
|
|
count = encoding->all_cases(ch, codepoints);
|
|
|
|
for (i = 0; i < count; i++)
|
|
bad[codepoints[i] & 0xFF] = offset;
|
|
} else
|
|
bad[ch & 0xFF] = offset;
|
|
}
|
|
|
|
is_same_char = ignore ? same_char_ign_wrapper : same_char_wrapper;
|
|
|
|
suffix_len = 2;
|
|
pos = suffix_len - 1;
|
|
saved_start = FALSE;
|
|
s = pos + 1;
|
|
i = suffix_len - 1;
|
|
s_start = s;
|
|
|
|
while (pos < length) {
|
|
/* Look for another occurrence of the suffix. */
|
|
while (i > 0) {
|
|
/* Have we dropped off the end of the string? */
|
|
if (s - i >= length)
|
|
break;
|
|
|
|
if (is_same_char(encoding, values[s - i], values[pos - i]))
|
|
/* It still matches. */
|
|
--i;
|
|
else {
|
|
/* Start again further along. */
|
|
++s;
|
|
i = suffix_len - 1;
|
|
}
|
|
}
|
|
|
|
if (s < length && is_same_char(encoding, values[s], values[pos])) {
|
|
/* We haven't dropped off the end of the string, and the suffix has
|
|
* matched this far, so this is a good starting point for the next
|
|
* iteration.
|
|
*/
|
|
++s;
|
|
if (!saved_start) {
|
|
s_start = s;
|
|
saved_start = TRUE;
|
|
}
|
|
} else {
|
|
/* Calculate the suffix offset. */
|
|
good[pos] = pos - s;
|
|
|
|
/* Extend the suffix and start searching for _this_ one. */
|
|
++pos;
|
|
++suffix_len;
|
|
|
|
/* Where's a good place to start searching? */
|
|
if (saved_start) {
|
|
s = s_start;
|
|
saved_start = FALSE;
|
|
} else
|
|
++s;
|
|
|
|
/* Can we short-circuit the searching? */
|
|
if (s >= length)
|
|
break;
|
|
}
|
|
|
|
i = suffix_len - 1;
|
|
}
|
|
|
|
/* Fill-in any remaining entries. */
|
|
while (pos < length) {
|
|
good[pos] = pos - s;
|
|
++pos;
|
|
++s;
|
|
}
|
|
|
|
node->string.bad_character_offset = bad;
|
|
node->string.good_suffix_offset = good;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Performs a string search. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) string_search(RE_SafeState* safe_state, RE_Node*
|
|
node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
|
|
RE_State* state;
|
|
Py_ssize_t found_pos;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
/* Has the node been initialised for fast searching, if necessary? */
|
|
if (!(node->status & RE_STATUS_FAST_INIT)) {
|
|
/* Ideally the pattern should immutable and shareable across threads.
|
|
* Internally, however, it isn't. For safety we need to hold the GIL.
|
|
*/
|
|
acquire_GIL(safe_state);
|
|
|
|
/* Double-check because of multithreading. */
|
|
if (!(node->status & RE_STATUS_FAST_INIT)) {
|
|
build_fast_tables(state->encoding, node, FALSE);
|
|
node->status |= RE_STATUS_FAST_INIT;
|
|
}
|
|
|
|
release_GIL(safe_state);
|
|
}
|
|
|
|
if (node->string.bad_character_offset) {
|
|
/* Start with a fast search. This will find the string if it's complete
|
|
* (i.e. not truncated).
|
|
*/
|
|
found_pos = fast_string_search(state, node, text_pos, limit);
|
|
if (found_pos < 0 && state->partial_side == RE_PARTIAL_RIGHT)
|
|
/* We didn't find the string, but it could've been truncated, so
|
|
* try again, starting close to the end.
|
|
*/
|
|
found_pos = simple_string_search(state, node, limit -
|
|
(Py_ssize_t)(node->value_count - 1), limit, is_partial);
|
|
} else
|
|
found_pos = simple_string_search(state, node, text_pos, limit,
|
|
is_partial);
|
|
|
|
return found_pos;
|
|
}
|
|
|
|
/* Performs a string search, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) string_search_fld(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, Py_ssize_t* new_pos,
|
|
BOOL* is_partial) {
|
|
RE_State* state;
|
|
RE_EncodingTable* encoding;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
void* text;
|
|
RE_CODE* values;
|
|
Py_ssize_t start_pos;
|
|
int f_pos;
|
|
int folded_len;
|
|
Py_ssize_t length;
|
|
Py_ssize_t s_pos;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
|
|
state = safe_state->re_state;
|
|
encoding = state->encoding;
|
|
full_case_fold = encoding->full_case_fold;
|
|
char_at = state->char_at;
|
|
text = state->text;
|
|
|
|
values = node->values;
|
|
start_pos = text_pos;
|
|
f_pos = 0;
|
|
folded_len = 0;
|
|
length = (Py_ssize_t)node->value_count;
|
|
s_pos = 0;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
while (s_pos < length || f_pos < folded_len) {
|
|
if (f_pos >= folded_len) {
|
|
/* Fetch and casefold another character. */
|
|
if (text_pos >= limit) {
|
|
if (text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT) {
|
|
*is_partial = TRUE;
|
|
return start_pos;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
folded_len = full_case_fold(char_at(text, text_pos), folded);
|
|
f_pos = 0;
|
|
}
|
|
|
|
if (same_char_ign(encoding, values[s_pos], folded[f_pos])) {
|
|
++s_pos;
|
|
++f_pos;
|
|
|
|
if (f_pos >= folded_len)
|
|
++text_pos;
|
|
} else {
|
|
++start_pos;
|
|
text_pos = start_pos;
|
|
f_pos = 0;
|
|
folded_len = 0;
|
|
s_pos = 0;
|
|
}
|
|
}
|
|
|
|
/* We found the string. */
|
|
if (new_pos)
|
|
*new_pos = text_pos;
|
|
|
|
return start_pos;
|
|
}
|
|
|
|
/* Performs a string search, backwards, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) string_search_fld_rev(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, Py_ssize_t* new_pos,
|
|
BOOL* is_partial) {
|
|
RE_State* state;
|
|
RE_EncodingTable* encoding;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
void* text;
|
|
RE_CODE* values;
|
|
Py_ssize_t start_pos;
|
|
int f_pos;
|
|
int folded_len;
|
|
Py_ssize_t length;
|
|
Py_ssize_t s_pos;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
|
|
state = safe_state->re_state;
|
|
encoding = state->encoding;
|
|
full_case_fold = encoding->full_case_fold;
|
|
char_at = state->char_at;
|
|
text = state->text;
|
|
|
|
values = node->values;
|
|
start_pos = text_pos;
|
|
f_pos = 0;
|
|
folded_len = 0;
|
|
length = (Py_ssize_t)node->value_count;
|
|
s_pos = 0;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
while (s_pos < length || f_pos < folded_len) {
|
|
if (f_pos >= folded_len) {
|
|
/* Fetch and casefold another character. */
|
|
if (text_pos <= limit) {
|
|
if (text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT) {
|
|
*is_partial = TRUE;
|
|
return start_pos;
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
folded_len = full_case_fold(char_at(text, text_pos - 1), folded);
|
|
f_pos = 0;
|
|
}
|
|
|
|
if (same_char_ign(encoding, values[length - s_pos - 1],
|
|
folded[folded_len - f_pos - 1])) {
|
|
++s_pos;
|
|
++f_pos;
|
|
|
|
if (f_pos >= folded_len)
|
|
--text_pos;
|
|
} else {
|
|
--start_pos;
|
|
text_pos = start_pos;
|
|
f_pos = 0;
|
|
folded_len = 0;
|
|
s_pos = 0;
|
|
}
|
|
}
|
|
|
|
/* We found the string. */
|
|
if (new_pos)
|
|
*new_pos = text_pos;
|
|
|
|
return start_pos;
|
|
}
|
|
|
|
/* Performs a string search, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) string_search_ign(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
|
|
RE_State* state;
|
|
Py_ssize_t found_pos;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
/* Has the node been initialised for fast searching, if necessary? */
|
|
if (!(node->status & RE_STATUS_FAST_INIT)) {
|
|
/* Ideally the pattern should immutable and shareable across threads.
|
|
* Internally, however, it isn't. For safety we need to hold the GIL.
|
|
*/
|
|
acquire_GIL(safe_state);
|
|
|
|
/* Double-check because of multithreading. */
|
|
if (!(node->status & RE_STATUS_FAST_INIT)) {
|
|
build_fast_tables(state->encoding, node, TRUE);
|
|
node->status |= RE_STATUS_FAST_INIT;
|
|
}
|
|
|
|
release_GIL(safe_state);
|
|
}
|
|
|
|
if (node->string.bad_character_offset) {
|
|
/* Start with a fast search. This will find the string if it's complete
|
|
* (i.e. not truncated).
|
|
*/
|
|
found_pos = fast_string_search_ign(state, node, text_pos, limit);
|
|
if (found_pos < 0 && state->partial_side == RE_PARTIAL_RIGHT)
|
|
/* We didn't find the string, but it could've been truncated, so
|
|
* try again, starting close to the end.
|
|
*/
|
|
found_pos = simple_string_search_ign(state, node, limit -
|
|
(Py_ssize_t)(node->value_count - 1), limit, is_partial);
|
|
} else
|
|
found_pos = simple_string_search_ign(state, node, text_pos, limit,
|
|
is_partial);
|
|
|
|
return found_pos;
|
|
}
|
|
|
|
/* Performs a string search, backwards, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) string_search_ign_rev(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
|
|
RE_State* state;
|
|
Py_ssize_t found_pos;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
/* Has the node been initialised for fast searching, if necessary? */
|
|
if (!(node->status & RE_STATUS_FAST_INIT)) {
|
|
/* Ideally the pattern should immutable and shareable across threads.
|
|
* Internally, however, it isn't. For safety we need to hold the GIL.
|
|
*/
|
|
acquire_GIL(safe_state);
|
|
|
|
/* Double-check because of multithreading. */
|
|
if (!(node->status & RE_STATUS_FAST_INIT)) {
|
|
build_fast_tables_rev(state->encoding, node, TRUE);
|
|
node->status |= RE_STATUS_FAST_INIT;
|
|
}
|
|
|
|
release_GIL(safe_state);
|
|
}
|
|
|
|
if (node->string.bad_character_offset) {
|
|
/* Start with a fast search. This will find the string if it's complete
|
|
* (i.e. not truncated).
|
|
*/
|
|
found_pos = fast_string_search_ign_rev(state, node, text_pos, limit);
|
|
if (found_pos < 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
/* We didn't find the string, but it could've been truncated, so
|
|
* try again, starting close to the end.
|
|
*/
|
|
found_pos = simple_string_search_ign_rev(state, node, limit +
|
|
(Py_ssize_t)(node->value_count - 1), limit, is_partial);
|
|
} else
|
|
found_pos = simple_string_search_ign_rev(state, node, text_pos, limit,
|
|
is_partial);
|
|
|
|
return found_pos;
|
|
}
|
|
|
|
/* Performs a string search, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) string_search_rev(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t limit, BOOL* is_partial) {
|
|
RE_State* state;
|
|
Py_ssize_t found_pos;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
/* Has the node been initialised for fast searching, if necessary? */
|
|
if (!(node->status & RE_STATUS_FAST_INIT)) {
|
|
/* Ideally the pattern should immutable and shareable across threads.
|
|
* Internally, however, it isn't. For safety we need to hold the GIL.
|
|
*/
|
|
acquire_GIL(safe_state);
|
|
|
|
/* Double-check because of multithreading. */
|
|
if (!(node->status & RE_STATUS_FAST_INIT)) {
|
|
build_fast_tables_rev(state->encoding, node, FALSE);
|
|
node->status |= RE_STATUS_FAST_INIT;
|
|
}
|
|
|
|
release_GIL(safe_state);
|
|
}
|
|
|
|
if (node->string.bad_character_offset) {
|
|
/* Start with a fast search. This will find the string if it's complete
|
|
* (i.e. not truncated).
|
|
*/
|
|
found_pos = fast_string_search_rev(state, node, text_pos, limit);
|
|
if (found_pos < 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
/* We didn't find the string, but it could've been truncated, so
|
|
* try again, starting close to the end.
|
|
*/
|
|
found_pos = simple_string_search_rev(state, node, limit +
|
|
(Py_ssize_t)(node->value_count - 1), limit, is_partial);
|
|
} else
|
|
found_pos = simple_string_search_rev(state, node, text_pos, limit,
|
|
is_partial);
|
|
|
|
return found_pos;
|
|
}
|
|
|
|
/* Returns how many characters there could be before full case-folding. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) possible_unfolded_length(Py_ssize_t length) {
|
|
if (length == 0)
|
|
return 0;
|
|
|
|
if (length < RE_MAX_FOLDED)
|
|
return 1;
|
|
|
|
return length / RE_MAX_FOLDED;
|
|
}
|
|
|
|
/* Checks whether there's any character except a newline at a position. */
|
|
Py_LOCAL_INLINE(int) try_match_ANY(RE_State* state, RE_Node* node, Py_ssize_t
|
|
text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_ANY(state->encoding, node, state->char_at(state->text,
|
|
text_pos)));
|
|
}
|
|
|
|
/* Checks whether there's any character at all at a position. */
|
|
Py_LOCAL_INLINE(int) try_match_ANY_ALL(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end);
|
|
}
|
|
|
|
/* Checks whether there's any character at all at a position, backwards. */
|
|
Py_LOCAL_INLINE(int) try_match_ANY_ALL_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start);
|
|
}
|
|
|
|
/* Checks whether there's any character except a newline at a position,
|
|
* backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_ANY_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_ANY(state->encoding, node, state->char_at(state->text, text_pos -
|
|
1)));
|
|
}
|
|
|
|
/* Checks whether there's any character except a line separator at a position.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_ANY_U(RE_State* state, RE_Node* node, Py_ssize_t
|
|
text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_ANY_U(state->encoding, node, state->char_at(state->text,
|
|
text_pos)));
|
|
}
|
|
|
|
/* Checks whether there's any character except a line separator at a position,
|
|
* backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_ANY_U_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_ANY_U(state->encoding, node, state->char_at(state->text, text_pos
|
|
- 1)));
|
|
}
|
|
|
|
/* Checks whether a position is on a word boundary. */
|
|
Py_LOCAL_INLINE(int) try_match_BOUNDARY(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_boundary(state, text_pos) ==
|
|
node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character at a position. */
|
|
Py_LOCAL_INLINE(int) try_match_CHARACTER(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_CHARACTER(state->encoding, node, state->char_at(state->text,
|
|
text_pos)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character at a position, ignoring case. */
|
|
Py_LOCAL_INLINE(int) try_match_CHARACTER_IGN(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_CHARACTER_IGN(state->encoding, node, state->char_at(state->text,
|
|
text_pos)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character at a position, ignoring case, backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_CHARACTER_IGN_REV(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_CHARACTER_IGN(state->encoding, node, state->char_at(state->text,
|
|
text_pos - 1)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character at a position, backwards. */
|
|
Py_LOCAL_INLINE(int) try_match_CHARACTER_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_CHARACTER(state->encoding, node, state->char_at(state->text,
|
|
text_pos - 1)) == node->match);
|
|
}
|
|
|
|
/* Checks whether a position is on a default word boundary. */
|
|
Py_LOCAL_INLINE(int) try_match_DEFAULT_BOUNDARY(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_default_boundary(state, text_pos)
|
|
== node->match);
|
|
}
|
|
|
|
/* Checks whether a position is at the default end of a word. */
|
|
Py_LOCAL_INLINE(int) try_match_DEFAULT_END_OF_WORD(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_default_word_end(state,
|
|
text_pos));
|
|
}
|
|
|
|
/* Checks whether a position is at the default start of a word. */
|
|
Py_LOCAL_INLINE(int) try_match_DEFAULT_START_OF_WORD(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_default_word_start(state,
|
|
text_pos));
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a line. */
|
|
Py_LOCAL_INLINE(int) try_match_END_OF_LINE(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(text_pos >= state->slice_end ||
|
|
state->char_at(state->text, text_pos) == '\n');
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a line. */
|
|
Py_LOCAL_INLINE(int) try_match_END_OF_LINE_U(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_line_end(state, text_pos));
|
|
}
|
|
|
|
/* Checks whether a position is at the end of the string. */
|
|
Py_LOCAL_INLINE(int) try_match_END_OF_STRING(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(text_pos >= state->text_length);
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a line or the string. */
|
|
Py_LOCAL_INLINE(int) try_match_END_OF_STRING_LINE(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos) {
|
|
return bool_as_status(text_pos >= state->text_length || text_pos ==
|
|
state->final_newline);
|
|
}
|
|
|
|
/* Checks whether a position is at the end of the string. */
|
|
Py_LOCAL_INLINE(int) try_match_END_OF_STRING_LINE_U(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos) {
|
|
return bool_as_status(text_pos >= state->text_length || text_pos ==
|
|
state->final_line_sep);
|
|
}
|
|
|
|
/* Checks whether a position is at the end of a word. */
|
|
Py_LOCAL_INLINE(int) try_match_END_OF_WORD(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_word_end(state, text_pos));
|
|
}
|
|
|
|
/* Checks whether a position is on a grapheme boundary. */
|
|
Py_LOCAL_INLINE(int) try_match_GRAPHEME_BOUNDARY(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_grapheme_boundary(state,
|
|
text_pos));
|
|
}
|
|
|
|
/* Checks whether there's a character with a certain property at a position. */
|
|
Py_LOCAL_INLINE(int) try_match_PROPERTY(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_PROPERTY(state->encoding, node, state->char_at(state->text,
|
|
text_pos)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character with a certain property at a position,
|
|
* ignoring case.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_PROPERTY_IGN(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_PROPERTY_IGN(state->encoding, node, state->char_at(state->text,
|
|
text_pos)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character with a certain property at a position,
|
|
* ignoring case, backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_PROPERTY_IGN_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_PROPERTY_IGN(state->encoding, node, state->char_at(state->text,
|
|
text_pos - 1)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character with a certain property at a position,
|
|
* backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_PROPERTY_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_PROPERTY(state->encoding, node, state->char_at(state->text,
|
|
text_pos - 1)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character in a certain range at a position. */
|
|
Py_LOCAL_INLINE(int) try_match_RANGE(RE_State* state, RE_Node* node, Py_ssize_t
|
|
text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_RANGE(state->encoding, node, state->char_at(state->text,
|
|
text_pos)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character in a certain range at a position,
|
|
* ignoring case.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_RANGE_IGN(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_RANGE_IGN(state->encoding, node, state->char_at(state->text,
|
|
text_pos)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character in a certain range at a position,
|
|
* ignoring case, backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_RANGE_IGN_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_RANGE_IGN(state->encoding, node, state->char_at(state->text,
|
|
text_pos - 1)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character in a certain range at a position,
|
|
* backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_RANGE_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_RANGE(state->encoding, node, state->char_at(state->text, text_pos
|
|
- 1)) == node->match);
|
|
}
|
|
|
|
/* Checks whether a position is at the search anchor. */
|
|
Py_LOCAL_INLINE(int) try_match_SEARCH_ANCHOR(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(text_pos == state->search_anchor);
|
|
}
|
|
|
|
/* Checks whether there's a character in a certain set at a position. */
|
|
Py_LOCAL_INLINE(int) try_match_SET(RE_State* state, RE_Node* node, Py_ssize_t
|
|
text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_SET(state->encoding, node, state->char_at(state->text, text_pos))
|
|
== node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character in a certain set at a position, ignoring
|
|
* case.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_SET_IGN(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos < state->slice_end &&
|
|
matches_SET_IGN(state->encoding, node, state->char_at(state->text,
|
|
text_pos)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character in a certain set at a position, ignoring
|
|
* case, backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_SET_IGN_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_SET_IGN(state->encoding, node, state->char_at(state->text,
|
|
text_pos - 1)) == node->match);
|
|
}
|
|
|
|
/* Checks whether there's a character in a certain set at a position,
|
|
* backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_SET_REV(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
if (text_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
return bool_as_status(text_pos > state->slice_start &&
|
|
matches_SET(state->encoding, node, state->char_at(state->text, text_pos -
|
|
1)) == node->match);
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a line. */
|
|
Py_LOCAL_INLINE(int) try_match_START_OF_LINE(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(text_pos <= 0 || state->char_at(state->text, text_pos
|
|
- 1) == '\n');
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a line. */
|
|
Py_LOCAL_INLINE(int) try_match_START_OF_LINE_U(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_line_start(state, text_pos));
|
|
}
|
|
|
|
/* Checks whether a position is at the start of the string. */
|
|
Py_LOCAL_INLINE(int) try_match_START_OF_STRING(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(text_pos <= 0);
|
|
}
|
|
|
|
/* Checks whether a position is at the start of a word. */
|
|
Py_LOCAL_INLINE(int) try_match_START_OF_WORD(RE_State* state, RE_Node* node,
|
|
Py_ssize_t text_pos) {
|
|
return bool_as_status(state->encoding->at_word_start(state, text_pos));
|
|
}
|
|
|
|
/* Checks whether there's a certain string at a position. */
|
|
Py_LOCAL_INLINE(int) try_match_STRING(RE_State* state, RE_NextNode* next,
|
|
RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) {
|
|
Py_ssize_t length;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
RE_EncodingTable* encoding;
|
|
RE_CODE* values;
|
|
Py_ssize_t s_pos;
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
char_at = state->char_at;
|
|
encoding = state->encoding;
|
|
values = node->values;
|
|
|
|
for (s_pos = 0; s_pos < length; s_pos++) {
|
|
if (text_pos + s_pos >= state->slice_end) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
if (!same_char(encoding, char_at(state->text, text_pos + s_pos),
|
|
values[s_pos]))
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
next_position->node = next->match_next;
|
|
next_position->text_pos = text_pos + next->match_step;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Checks whether there's a certain string at a position, ignoring case. */
|
|
Py_LOCAL_INLINE(int) try_match_STRING_FLD(RE_State* state, RE_NextNode* next,
|
|
RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) {
|
|
Py_ssize_t length;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
RE_EncodingTable* encoding;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
Py_ssize_t s_pos;
|
|
RE_CODE* values;
|
|
int folded_len;
|
|
int f_pos;
|
|
Py_ssize_t start_pos;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
char_at = state->char_at;
|
|
encoding = state->encoding;
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
s_pos = 0;
|
|
values = node->values;
|
|
folded_len = 0;
|
|
f_pos = 0;
|
|
start_pos = text_pos;
|
|
|
|
while (s_pos < length) {
|
|
if (f_pos >= folded_len) {
|
|
/* Fetch and casefold another character. */
|
|
if (text_pos >= state->slice_end) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
folded_len = full_case_fold(char_at(state->text, text_pos),
|
|
folded);
|
|
f_pos = 0;
|
|
}
|
|
|
|
if (!same_char_ign(encoding, folded[f_pos], values[s_pos]))
|
|
return RE_ERROR_FAILURE;
|
|
|
|
++s_pos;
|
|
++f_pos;
|
|
|
|
if (f_pos >= folded_len)
|
|
++text_pos;
|
|
}
|
|
|
|
if (f_pos < folded_len)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
next_position->node = next->match_next;
|
|
if (next->match_step == 0)
|
|
next_position->text_pos = start_pos;
|
|
else
|
|
next_position->text_pos = text_pos;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Checks whether there's a certain string at a position, ignoring case,
|
|
* backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_STRING_FLD_REV(RE_State* state, RE_NextNode*
|
|
next, RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) {
|
|
Py_ssize_t length;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
RE_EncodingTable* encoding;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
Py_ssize_t s_pos;
|
|
RE_CODE* values;
|
|
int folded_len;
|
|
int f_pos;
|
|
Py_ssize_t start_pos;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
char_at = state->char_at;
|
|
encoding = state->encoding;
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
s_pos = 0;
|
|
values = node->values;
|
|
folded_len = 0;
|
|
f_pos = 0;
|
|
start_pos = text_pos;
|
|
|
|
while (s_pos < length) {
|
|
if (f_pos >= folded_len) {
|
|
/* Fetch and casefold another character. */
|
|
if (text_pos <= state->slice_start) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
folded_len = full_case_fold(char_at(state->text, text_pos - 1),
|
|
folded);
|
|
f_pos = 0;
|
|
}
|
|
|
|
if (!same_char_ign(encoding, folded[folded_len - f_pos - 1],
|
|
values[length - s_pos - 1]))
|
|
return RE_ERROR_FAILURE;
|
|
|
|
++s_pos;
|
|
++f_pos;
|
|
|
|
if (f_pos >= folded_len)
|
|
--text_pos;
|
|
}
|
|
|
|
if (f_pos < folded_len)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
next_position->node = next->match_next;
|
|
if (next->match_step == 0)
|
|
next_position->text_pos = start_pos;
|
|
else
|
|
next_position->text_pos = text_pos;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Checks whether there's a certain string at a position, ignoring case. */
|
|
Py_LOCAL_INLINE(int) try_match_STRING_IGN(RE_State* state, RE_NextNode* next,
|
|
RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) {
|
|
Py_ssize_t length;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
RE_EncodingTable* encoding;
|
|
RE_CODE* values;
|
|
Py_ssize_t s_pos;
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
char_at = state->char_at;
|
|
encoding = state->encoding;
|
|
values = node->values;
|
|
|
|
for (s_pos = 0; s_pos < length; s_pos++) {
|
|
if (text_pos + s_pos >= state->slice_end) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
if (!same_char_ign(encoding, char_at(state->text, text_pos + s_pos),
|
|
values[s_pos]))
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
next_position->node = next->match_next;
|
|
next_position->text_pos = text_pos + next->match_step;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Checks whether there's a certain string at a position, ignoring case,
|
|
* backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match_STRING_IGN_REV(RE_State* state, RE_NextNode*
|
|
next, RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) {
|
|
Py_ssize_t length;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
RE_EncodingTable* encoding;
|
|
RE_CODE* values;
|
|
Py_ssize_t s_pos;
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
char_at = state->char_at;
|
|
encoding = state->encoding;
|
|
values = node->values;
|
|
|
|
for (s_pos = 0; s_pos < length; s_pos++) {
|
|
if (text_pos - s_pos <= state->slice_start) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
if (!same_char_ign(encoding, char_at(state->text, text_pos - s_pos -
|
|
1), values[length - s_pos - 1]))
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
next_position->node = next->match_next;
|
|
next_position->text_pos = text_pos + next->match_step;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Checks whether there's a certain string at a position, backwards. */
|
|
Py_LOCAL_INLINE(int) try_match_STRING_REV(RE_State* state, RE_NextNode* next,
|
|
RE_Node* node, Py_ssize_t text_pos, RE_Position* next_position) {
|
|
Py_ssize_t length;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
RE_EncodingTable* encoding;
|
|
RE_CODE* values;
|
|
Py_ssize_t s_pos;
|
|
|
|
length = (Py_ssize_t)node->value_count;
|
|
char_at = state->char_at;
|
|
encoding = state->encoding;
|
|
values = node->values;
|
|
|
|
for (s_pos = 0; s_pos < length; s_pos++) {
|
|
if (text_pos - s_pos <= state->slice_start) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
if (!same_char(encoding, char_at(state->text, text_pos - s_pos - 1),
|
|
values[length - s_pos - 1]))
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
next_position->node = next->match_next;
|
|
next_position->text_pos = text_pos + next->match_step;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Tries a match at the current text position.
|
|
*
|
|
* Returns the next node and text position if the match succeeds.
|
|
*/
|
|
Py_LOCAL_INLINE(int) try_match(RE_State* state, RE_NextNode* next, Py_ssize_t
|
|
text_pos, RE_Position* next_position) {
|
|
RE_Node* test;
|
|
int status;
|
|
|
|
test = next->test;
|
|
|
|
if (test->status & RE_STATUS_FUZZY) {
|
|
next_position->node = next->node;
|
|
next_position->text_pos = text_pos;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
switch (test->op) {
|
|
case RE_OP_ANY:
|
|
status = try_match_ANY(state, test, text_pos);
|
|
break;
|
|
case RE_OP_ANY_ALL:
|
|
status = try_match_ANY_ALL(state, test, text_pos);
|
|
break;
|
|
case RE_OP_ANY_ALL_REV:
|
|
status = try_match_ANY_ALL_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_ANY_REV:
|
|
status = try_match_ANY_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_ANY_U:
|
|
status = try_match_ANY_U(state, test, text_pos);
|
|
break;
|
|
case RE_OP_ANY_U_REV:
|
|
status = try_match_ANY_U_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_BOUNDARY:
|
|
status = try_match_BOUNDARY(state, test, text_pos);
|
|
break;
|
|
case RE_OP_BRANCH:
|
|
status = try_match(state, &test->next_1, text_pos, next_position);
|
|
if (status == RE_ERROR_FAILURE)
|
|
status = try_match(state, &test->nonstring.next_2, text_pos,
|
|
next_position);
|
|
break;
|
|
case RE_OP_CHARACTER:
|
|
status = try_match_CHARACTER(state, test, text_pos);
|
|
break;
|
|
case RE_OP_CHARACTER_IGN:
|
|
status = try_match_CHARACTER_IGN(state, test, text_pos);
|
|
break;
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
status = try_match_CHARACTER_IGN_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_CHARACTER_REV:
|
|
status = try_match_CHARACTER_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_DEFAULT_BOUNDARY:
|
|
status = try_match_DEFAULT_BOUNDARY(state, test, text_pos);
|
|
break;
|
|
case RE_OP_DEFAULT_END_OF_WORD:
|
|
status = try_match_DEFAULT_END_OF_WORD(state, test, text_pos);
|
|
break;
|
|
case RE_OP_DEFAULT_START_OF_WORD:
|
|
status = try_match_DEFAULT_START_OF_WORD(state, test, text_pos);
|
|
break;
|
|
case RE_OP_END_OF_LINE:
|
|
status = try_match_END_OF_LINE(state, test, text_pos);
|
|
break;
|
|
case RE_OP_END_OF_LINE_U:
|
|
status = try_match_END_OF_LINE_U(state, test, text_pos);
|
|
break;
|
|
case RE_OP_END_OF_STRING:
|
|
status = try_match_END_OF_STRING(state, test, text_pos);
|
|
break;
|
|
case RE_OP_END_OF_STRING_LINE:
|
|
status = try_match_END_OF_STRING_LINE(state, test, text_pos);
|
|
break;
|
|
case RE_OP_END_OF_STRING_LINE_U:
|
|
status = try_match_END_OF_STRING_LINE_U(state, test, text_pos);
|
|
break;
|
|
case RE_OP_END_OF_WORD:
|
|
status = try_match_END_OF_WORD(state, test, text_pos);
|
|
break;
|
|
case RE_OP_GRAPHEME_BOUNDARY:
|
|
status = try_match_GRAPHEME_BOUNDARY(state, test, text_pos);
|
|
break;
|
|
case RE_OP_PROPERTY:
|
|
status = try_match_PROPERTY(state, test, text_pos);
|
|
break;
|
|
case RE_OP_PROPERTY_IGN:
|
|
status = try_match_PROPERTY_IGN(state, test, text_pos);
|
|
break;
|
|
case RE_OP_PROPERTY_IGN_REV:
|
|
status = try_match_PROPERTY_IGN_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_PROPERTY_REV:
|
|
status = try_match_PROPERTY_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_RANGE:
|
|
status = try_match_RANGE(state, test, text_pos);
|
|
break;
|
|
case RE_OP_RANGE_IGN:
|
|
status = try_match_RANGE_IGN(state, test, text_pos);
|
|
break;
|
|
case RE_OP_RANGE_IGN_REV:
|
|
status = try_match_RANGE_IGN_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_RANGE_REV:
|
|
status = try_match_RANGE_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_SEARCH_ANCHOR:
|
|
status = try_match_SEARCH_ANCHOR(state, test, text_pos);
|
|
break;
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_UNION:
|
|
status = try_match_SET(state, test, text_pos);
|
|
break;
|
|
case RE_OP_SET_DIFF_IGN:
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_UNION_IGN:
|
|
status = try_match_SET_IGN(state, test, text_pos);
|
|
break;
|
|
case RE_OP_SET_DIFF_IGN_REV:
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
status = try_match_SET_IGN_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_SET_DIFF_REV:
|
|
case RE_OP_SET_INTER_REV:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
case RE_OP_SET_UNION_REV:
|
|
status = try_match_SET_REV(state, test, text_pos);
|
|
break;
|
|
case RE_OP_START_OF_LINE:
|
|
status = try_match_START_OF_LINE(state, test, text_pos);
|
|
break;
|
|
case RE_OP_START_OF_LINE_U:
|
|
status = try_match_START_OF_LINE_U(state, test, text_pos);
|
|
break;
|
|
case RE_OP_START_OF_STRING:
|
|
status = try_match_START_OF_STRING(state, test, text_pos);
|
|
break;
|
|
case RE_OP_START_OF_WORD:
|
|
status = try_match_START_OF_WORD(state, test, text_pos);
|
|
break;
|
|
case RE_OP_STRING:
|
|
return try_match_STRING(state, next, test, text_pos, next_position);
|
|
case RE_OP_STRING_FLD:
|
|
return try_match_STRING_FLD(state, next, test, text_pos,
|
|
next_position);
|
|
case RE_OP_STRING_FLD_REV:
|
|
return try_match_STRING_FLD_REV(state, next, test, text_pos,
|
|
next_position);
|
|
case RE_OP_STRING_IGN:
|
|
return try_match_STRING_IGN(state, next, test, text_pos,
|
|
next_position);
|
|
case RE_OP_STRING_IGN_REV:
|
|
return try_match_STRING_IGN_REV(state, next, test, text_pos,
|
|
next_position);
|
|
case RE_OP_STRING_REV:
|
|
return try_match_STRING_REV(state, next, test, text_pos,
|
|
next_position);
|
|
default:
|
|
next_position->node = next->node;
|
|
next_position->text_pos = text_pos;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
next_position->node = next->match_next;
|
|
next_position->text_pos = text_pos + next->match_step;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Searches for a word boundary. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_boundary = state->encoding->at_boundary;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_boundary(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for a word boundary, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY_rev(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_boundary = state->encoding->at_boundary;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_boundary(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for a default word boundary. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_BOUNDARY(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_default_boundary)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_default_boundary = state->encoding->at_default_boundary;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_default_boundary(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for a default word boundary, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_BOUNDARY_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_default_boundary)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_default_boundary = state->encoding->at_default_boundary;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_default_boundary(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the default end of a word. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_END_OF_WORD(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_default_word_end)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_default_word_end = state->encoding->at_default_word_end;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_default_word_end(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the default end of a word, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_END_OF_WORD_rev(RE_State*
|
|
state, RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_default_word_end)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_default_word_end = state->encoding->at_default_word_end;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_default_word_end(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the default start of a word. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_START_OF_WORD(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_default_word_start)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_default_word_start = state->encoding->at_default_word_start;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_default_word_start(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the default start of a word, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_DEFAULT_START_OF_WORD_rev(RE_State*
|
|
state, RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_default_word_start)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_default_word_start = state->encoding->at_default_word_start;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_default_word_start(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the end of line. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_LINE(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (text_pos >= state->text_length || state->char_at(state->text,
|
|
text_pos) == '\n')
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the end of line, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_LINE_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (text_pos >= state->text_length || state->char_at(state->text,
|
|
text_pos) == '\n')
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the end of the string. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_STRING(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
if (state->slice_end >= state->text_length)
|
|
return state->text_length;
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Searches for the end of the string, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_STRING_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
if (text_pos >= state->text_length)
|
|
return text_pos;
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Searches for the end of the string or line. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_STRING_LINE(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
if (text_pos <= state->final_newline)
|
|
text_pos = state->final_newline;
|
|
else if (text_pos <= state->text_length)
|
|
text_pos = state->text_length;
|
|
|
|
if (text_pos > state->slice_end)
|
|
return -1;
|
|
|
|
if (text_pos >= state->text_length)
|
|
return text_pos;
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Searches for the end of the string or line, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_STRING_LINE_rev(RE_State*
|
|
state, RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
if (text_pos >= state->text_length)
|
|
text_pos = state->text_length;
|
|
else if (text_pos >= state->final_newline)
|
|
text_pos = state->final_newline;
|
|
else
|
|
return -1;
|
|
|
|
if (text_pos < state->slice_start)
|
|
return -1;
|
|
|
|
if (text_pos <= 0)
|
|
return text_pos;
|
|
|
|
return text_pos;
|
|
}
|
|
|
|
/* Searches for the end of a word. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_WORD(RE_State* state, RE_Node*
|
|
node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_word_end)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_word_end = state->encoding->at_word_end;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_word_end(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the end of a word, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_END_OF_WORD_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_word_end)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_word_end = state->encoding->at_word_end;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_word_end(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for a grapheme boundary. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_GRAPHEME_BOUNDARY(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_grapheme_boundary)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_grapheme_boundary = state->encoding->at_grapheme_boundary;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_grapheme_boundary(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for a grapheme boundary, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_GRAPHEME_BOUNDARY_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_grapheme_boundary)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_grapheme_boundary = state->encoding->at_grapheme_boundary;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_grapheme_boundary(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the start of line. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_LINE(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (text_pos <= 0 || state->char_at(state->text, text_pos - 1) == '\n')
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the start of line, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_LINE_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (text_pos <= 0 || state->char_at(state->text, text_pos - 1) == '\n')
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the start of the string. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_STRING(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
if (text_pos <= 0)
|
|
return text_pos;
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Searches for the start of the string, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_STRING_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
*is_partial = FALSE;
|
|
|
|
if (state->slice_start <= 0)
|
|
return 0;
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* Searches for the start of a word. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_WORD(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_word_start)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_word_start = state->encoding->at_word_start;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_word_start(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos >= state->slice_end)
|
|
return -1;
|
|
|
|
++text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for the start of a word, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_START_OF_WORD_rev(RE_State* state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
BOOL (*at_word_start)(RE_State* state, Py_ssize_t text_pos);
|
|
|
|
at_word_start = state->encoding->at_word_start;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
for (;;) {
|
|
if (at_word_start(state, text_pos) == node->match)
|
|
return text_pos;
|
|
|
|
if (text_pos <= state->slice_start)
|
|
return -1;
|
|
|
|
--text_pos;
|
|
}
|
|
}
|
|
|
|
/* Searches for a string. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
RE_State* state;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos)
|
|
return text_pos;
|
|
|
|
return string_search(safe_state, node, text_pos, state->slice_end,
|
|
is_partial);
|
|
}
|
|
|
|
/* Searches for a string, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_FLD(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, Py_ssize_t* new_pos, BOOL* is_partial) {
|
|
RE_State* state;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos) {
|
|
*new_pos = state->req_end;
|
|
return text_pos;
|
|
}
|
|
|
|
return string_search_fld(safe_state, node, text_pos, state->slice_end,
|
|
new_pos, is_partial);
|
|
}
|
|
|
|
/* Searches for a string, ignoring case, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_FLD_REV(RE_SafeState*
|
|
safe_state, RE_Node* node, Py_ssize_t text_pos, Py_ssize_t* new_pos, BOOL*
|
|
is_partial) {
|
|
RE_State* state;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos) {
|
|
*new_pos = state->req_end;
|
|
return text_pos;
|
|
}
|
|
|
|
return string_search_fld_rev(safe_state, node, text_pos,
|
|
state->slice_start, new_pos, is_partial);
|
|
}
|
|
|
|
/* Searches for a string, ignoring case. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_IGN(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
RE_State* state;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos)
|
|
return text_pos;
|
|
|
|
return string_search_ign(safe_state, node, text_pos, state->slice_end,
|
|
is_partial);
|
|
}
|
|
|
|
/* Searches for a string, ignoring case, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_IGN_REV(RE_SafeState*
|
|
safe_state, RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
RE_State* state;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos)
|
|
return text_pos;
|
|
|
|
return string_search_ign_rev(safe_state, node, text_pos,
|
|
state->slice_start, is_partial);
|
|
}
|
|
|
|
/* Searches for a string, backwards. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) search_start_STRING_REV(RE_SafeState* safe_state,
|
|
RE_Node* node, Py_ssize_t text_pos, BOOL* is_partial) {
|
|
RE_State* state;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
*is_partial = FALSE;
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && text_pos == state->req_pos)
|
|
return text_pos;
|
|
|
|
return string_search_rev(safe_state, node, text_pos, state->slice_start,
|
|
is_partial);
|
|
}
|
|
|
|
/* Searches for the start of a match. */
|
|
Py_LOCAL_INLINE(int) search_start(RE_SafeState* safe_state, RE_NextNode* next,
|
|
RE_Position* new_position, int search_index) {
|
|
RE_State* state;
|
|
Py_ssize_t text_pos;
|
|
RE_Node* test;
|
|
RE_Node* node;
|
|
Py_ssize_t start_pos;
|
|
RE_SearchPosition* info;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
start_pos = state->text_pos;
|
|
TRACE(("<<search_start>> at %d\n", start_pos))
|
|
|
|
test = next->test;
|
|
node = next->node;
|
|
|
|
if (state->reverse) {
|
|
if (start_pos < state->slice_start) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = state->slice_start;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
} else {
|
|
if (start_pos > state->slice_end) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = state->slice_end;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (test->status & RE_STATUS_FUZZY) {
|
|
/* Don't call 'search_start' again. */
|
|
state->pattern->do_search_start = FALSE;
|
|
|
|
state->match_pos = start_pos;
|
|
new_position->node = node;
|
|
new_position->text_pos = start_pos;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
again:
|
|
if (!state->pattern->is_fuzzy && state->partial_side == RE_PARTIAL_NONE) {
|
|
if (state->reverse) {
|
|
if (start_pos - state->min_width < state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
} else {
|
|
if (start_pos + state->min_width > state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
}
|
|
|
|
if (search_index < MAX_SEARCH_POSITIONS) {
|
|
info = &state->search_positions[search_index];
|
|
if (state->reverse) {
|
|
if (info->start_pos >= 0 && info->start_pos >= start_pos &&
|
|
start_pos >= info->match_pos) {
|
|
state->match_pos = info->match_pos;
|
|
|
|
new_position->text_pos = state->match_pos;
|
|
new_position->node = node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
} else {
|
|
if (info->start_pos >= 0 && info->start_pos <= start_pos &&
|
|
start_pos <= info->match_pos) {
|
|
state->match_pos = info->match_pos;
|
|
|
|
new_position->text_pos = state->match_pos;
|
|
new_position->node = node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
}
|
|
} else
|
|
info = NULL;
|
|
|
|
switch (test->op) {
|
|
case RE_OP_ANY:
|
|
start_pos = match_many_ANY(state, test, start_pos, state->slice_end,
|
|
FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_ANY_ALL:
|
|
break;
|
|
case RE_OP_ANY_ALL_REV:
|
|
break;
|
|
case RE_OP_ANY_REV:
|
|
start_pos = match_many_ANY_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_ANY_U:
|
|
start_pos = match_many_ANY_U(state, test, start_pos, state->slice_end,
|
|
FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_ANY_U_REV:
|
|
start_pos = match_many_ANY_U_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_BOUNDARY:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_BOUNDARY_rev(state, test, start_pos,
|
|
&is_partial);
|
|
else
|
|
start_pos = search_start_BOUNDARY(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_CHARACTER:
|
|
start_pos = match_many_CHARACTER(state, test, start_pos,
|
|
state->slice_end, FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_CHARACTER_IGN:
|
|
start_pos = match_many_CHARACTER_IGN(state, test, start_pos,
|
|
state->slice_end, FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
start_pos = match_many_CHARACTER_IGN_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_CHARACTER_REV:
|
|
start_pos = match_many_CHARACTER_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_DEFAULT_BOUNDARY:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_DEFAULT_BOUNDARY_rev(state, test,
|
|
start_pos, &is_partial);
|
|
else
|
|
start_pos = search_start_DEFAULT_BOUNDARY(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_DEFAULT_END_OF_WORD:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_DEFAULT_END_OF_WORD_rev(state, test,
|
|
start_pos, &is_partial);
|
|
else
|
|
start_pos = search_start_DEFAULT_END_OF_WORD(state, test,
|
|
start_pos, &is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_DEFAULT_START_OF_WORD:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_DEFAULT_START_OF_WORD_rev(state, test,
|
|
start_pos, &is_partial);
|
|
else
|
|
start_pos = search_start_DEFAULT_START_OF_WORD(state, test,
|
|
start_pos, &is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_END_OF_LINE:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_END_OF_LINE_rev(state, test, start_pos,
|
|
&is_partial);
|
|
else
|
|
start_pos = search_start_END_OF_LINE(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_END_OF_STRING:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_END_OF_STRING_rev(state, test, start_pos,
|
|
&is_partial);
|
|
else
|
|
start_pos = search_start_END_OF_STRING(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_END_OF_STRING_LINE:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_END_OF_STRING_LINE_rev(state, test,
|
|
start_pos, &is_partial);
|
|
else
|
|
start_pos = search_start_END_OF_STRING_LINE(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_END_OF_WORD:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_END_OF_WORD_rev(state, test, start_pos,
|
|
&is_partial);
|
|
else
|
|
start_pos = search_start_END_OF_WORD(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_GRAPHEME_BOUNDARY:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_GRAPHEME_BOUNDARY_rev(state, test,
|
|
start_pos, &is_partial);
|
|
else
|
|
start_pos = search_start_GRAPHEME_BOUNDARY(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_PROPERTY:
|
|
start_pos = match_many_PROPERTY(state, test, start_pos,
|
|
state->slice_end, FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_PROPERTY_IGN:
|
|
start_pos = match_many_PROPERTY_IGN(state, test, start_pos,
|
|
state->slice_end, FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_PROPERTY_IGN_REV:
|
|
start_pos = match_many_PROPERTY_IGN_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_PROPERTY_REV:
|
|
start_pos = match_many_PROPERTY_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_RANGE:
|
|
start_pos = match_many_RANGE(state, test, start_pos, state->slice_end,
|
|
FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_RANGE_IGN:
|
|
start_pos = match_many_RANGE_IGN(state, test, start_pos,
|
|
state->slice_end, FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_RANGE_IGN_REV:
|
|
start_pos = match_many_RANGE_IGN_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_RANGE_REV:
|
|
start_pos = match_many_RANGE_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return RE_ERROR_FAILURE;
|
|
break;
|
|
case RE_OP_SEARCH_ANCHOR:
|
|
if (state->reverse) {
|
|
if (start_pos < state->search_anchor)
|
|
return RE_ERROR_FAILURE;
|
|
} else {
|
|
if (start_pos > state->search_anchor)
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
start_pos = state->search_anchor;
|
|
break;
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_UNION:
|
|
start_pos = match_many_SET(state, test, start_pos, state->slice_end,
|
|
FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return FALSE;
|
|
break;
|
|
case RE_OP_SET_DIFF_IGN:
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_UNION_IGN:
|
|
start_pos = match_many_SET_IGN(state, test, start_pos,
|
|
state->slice_end, FALSE);
|
|
|
|
if (start_pos >= state->text_length) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos >= state->slice_end)
|
|
return FALSE;
|
|
break;
|
|
case RE_OP_SET_DIFF_IGN_REV:
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
start_pos = match_many_SET_IGN_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return FALSE;
|
|
break;
|
|
case RE_OP_SET_DIFF_REV:
|
|
case RE_OP_SET_INTER_REV:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
case RE_OP_SET_UNION_REV:
|
|
start_pos = match_many_SET_REV(state, test, start_pos,
|
|
state->slice_start, FALSE);
|
|
|
|
if (start_pos <= 0) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
}
|
|
|
|
if (start_pos <= state->slice_start)
|
|
return FALSE;
|
|
break;
|
|
case RE_OP_START_OF_LINE:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_START_OF_LINE_rev(state, test, start_pos,
|
|
&is_partial);
|
|
else
|
|
start_pos = search_start_START_OF_LINE(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_START_OF_STRING:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_START_OF_STRING_rev(state, test,
|
|
start_pos, &is_partial);
|
|
else
|
|
start_pos = search_start_START_OF_STRING(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_START_OF_WORD:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
if (state->reverse)
|
|
start_pos = search_start_START_OF_WORD_rev(state, test, start_pos,
|
|
&is_partial);
|
|
else
|
|
start_pos = search_start_START_OF_WORD(state, test, start_pos,
|
|
&is_partial);
|
|
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
start_pos = search_start_STRING(safe_state, test, start_pos,
|
|
&is_partial);
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD:
|
|
{
|
|
Py_ssize_t new_pos;
|
|
BOOL is_partial;
|
|
|
|
start_pos = search_start_STRING_FLD(safe_state, test, start_pos,
|
|
&new_pos, &is_partial);
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
/* Can we look further ahead? */
|
|
if (test == node) {
|
|
if (test->next_1.node) {
|
|
int status;
|
|
|
|
status = try_match(state, &test->next_1, new_pos,
|
|
new_position);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE) {
|
|
++start_pos;
|
|
|
|
if (start_pos >= state->slice_end) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = state->slice_start;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
goto again;
|
|
}
|
|
}
|
|
|
|
/* It's a possible match. */
|
|
state->match_pos = start_pos;
|
|
|
|
if (info) {
|
|
info->start_pos = state->text_pos;
|
|
info->match_pos = state->match_pos;
|
|
}
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD_REV:
|
|
{
|
|
Py_ssize_t new_pos;
|
|
BOOL is_partial;
|
|
|
|
start_pos = search_start_STRING_FLD_REV(safe_state, test, start_pos,
|
|
&new_pos, &is_partial);
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
/* Can we look further ahead? */
|
|
if (test == node) {
|
|
if (test->next_1.node) {
|
|
int status;
|
|
|
|
status = try_match(state, &test->next_1, new_pos,
|
|
new_position);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE) {
|
|
--start_pos;
|
|
|
|
if (start_pos <= state->slice_start) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = state->slice_start;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
goto again;
|
|
}
|
|
}
|
|
|
|
/* It's a possible match. */
|
|
state->match_pos = start_pos;
|
|
|
|
if (info) {
|
|
info->start_pos = state->text_pos;
|
|
info->match_pos = state->match_pos;
|
|
}
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
start_pos = search_start_STRING_IGN(safe_state, test, start_pos,
|
|
&is_partial);
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN_REV:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
start_pos = search_start_STRING_IGN_REV(safe_state, test, start_pos,
|
|
&is_partial);
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_REV:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
start_pos = search_start_STRING_REV(safe_state, test, start_pos,
|
|
&is_partial);
|
|
if (start_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
if (is_partial) {
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
/* Don't call 'search_start' again. */
|
|
state->pattern->do_search_start = FALSE;
|
|
|
|
state->match_pos = start_pos;
|
|
new_position->node = node;
|
|
new_position->text_pos = start_pos;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Can we look further ahead? */
|
|
if (test == node) {
|
|
text_pos = start_pos + test->step;
|
|
|
|
if (test->next_1.node) {
|
|
int status;
|
|
|
|
status = try_match(state, &test->next_1, text_pos, new_position);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE) {
|
|
if (state->reverse) {
|
|
--start_pos;
|
|
|
|
if (start_pos < state->slice_start) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT) {
|
|
new_position->text_pos = state->slice_start;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
} else {
|
|
++start_pos;
|
|
|
|
if (start_pos > state->slice_end) {
|
|
if (state->partial_side == RE_PARTIAL_RIGHT) {
|
|
new_position->text_pos = state->slice_end;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
}
|
|
|
|
goto again;
|
|
}
|
|
}
|
|
} else {
|
|
new_position->node = node;
|
|
new_position->text_pos = start_pos;
|
|
}
|
|
|
|
/* It's a possible match. */
|
|
state->match_pos = start_pos;
|
|
|
|
if (info) {
|
|
info->start_pos = state->text_pos;
|
|
info->match_pos = state->match_pos;
|
|
}
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Saves a capture group. */
|
|
Py_LOCAL_INLINE(BOOL) save_capture(RE_SafeState* safe_state, size_t
|
|
private_index, size_t public_index) {
|
|
RE_State* state;
|
|
RE_GroupData* private_group;
|
|
RE_GroupData* public_group;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
private_group = &state->groups[private_index - 1];
|
|
public_group = &state->groups[public_index - 1];
|
|
|
|
/* Will the repeated captures ever be visible? */
|
|
if (!state->visible_captures) {
|
|
public_group->captures[0] = private_group->span;
|
|
public_group->capture_count = 1;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
if (public_group->capture_count >= public_group->capture_capacity) {
|
|
size_t new_capacity;
|
|
RE_GroupSpan* new_captures;
|
|
|
|
new_capacity = public_group->capture_capacity * 2;
|
|
new_capacity = max_size_t(new_capacity, RE_INIT_CAPTURE_SIZE);
|
|
new_captures = (RE_GroupSpan*)safe_realloc(safe_state,
|
|
public_group->captures, new_capacity * sizeof(RE_GroupSpan));
|
|
if (!new_captures)
|
|
return FALSE;
|
|
|
|
public_group->captures = new_captures;
|
|
public_group->capture_capacity = new_capacity;
|
|
}
|
|
|
|
public_group->captures[public_group->capture_count++] =
|
|
private_group->span;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Unsaves a capture group. */
|
|
Py_LOCAL_INLINE(void) unsave_capture(RE_State* state, size_t private_index,
|
|
size_t public_index) {
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
if (state->groups[public_index - 1].capture_count > 0)
|
|
--state->groups[public_index - 1].capture_count;
|
|
}
|
|
|
|
/* Pushes the groups for backtracking. */
|
|
Py_LOCAL_INLINE(BOOL) push_groups(RE_SafeState* safe_state) {
|
|
RE_State* state;
|
|
size_t group_count;
|
|
RE_SavedGroups* current;
|
|
size_t g;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
group_count = state->pattern->true_group_count;
|
|
if (group_count == 0)
|
|
return TRUE;
|
|
|
|
current = state->current_saved_groups;
|
|
|
|
if (current && current->next)
|
|
current = current->next;
|
|
else if (!current && state->first_saved_groups)
|
|
current = state->first_saved_groups;
|
|
else {
|
|
RE_SavedGroups* new_block;
|
|
|
|
new_block = (RE_SavedGroups*)safe_alloc(safe_state,
|
|
sizeof(RE_SavedGroups));
|
|
if (!new_block)
|
|
return FALSE;
|
|
|
|
new_block->spans = (RE_GroupSpan*)safe_alloc(safe_state, group_count *
|
|
sizeof(RE_GroupSpan));
|
|
new_block->counts = (size_t*)safe_alloc(safe_state, group_count *
|
|
sizeof(Py_ssize_t));
|
|
if (!new_block->spans || !new_block->counts) {
|
|
safe_dealloc(safe_state, new_block->spans);
|
|
safe_dealloc(safe_state, new_block->counts);
|
|
safe_dealloc(safe_state, new_block);
|
|
return FALSE;
|
|
}
|
|
|
|
new_block->previous = current;
|
|
new_block->next = NULL;
|
|
|
|
if (new_block->previous)
|
|
new_block->previous->next = new_block;
|
|
else
|
|
state->first_saved_groups = new_block;
|
|
|
|
current = new_block;
|
|
}
|
|
|
|
for (g = 0; g < group_count; g++) {
|
|
current->spans[g] = state->groups[g].span;
|
|
current->counts[g] = state->groups[g].capture_count;
|
|
}
|
|
|
|
state->current_saved_groups = current;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Pops the groups for backtracking. */
|
|
Py_LOCAL_INLINE(void) pop_groups(RE_State* state) {
|
|
size_t group_count;
|
|
RE_SavedGroups* current;
|
|
size_t g;
|
|
|
|
group_count = state->pattern->true_group_count;
|
|
if (group_count == 0)
|
|
return;
|
|
|
|
current = state->current_saved_groups;
|
|
|
|
for (g = 0; g < group_count; g++) {
|
|
state->groups[g].span = current->spans[g];
|
|
state->groups[g].capture_count = current->counts[g];
|
|
}
|
|
|
|
state->current_saved_groups = current->previous;
|
|
}
|
|
|
|
/* Drops the groups for backtracking. */
|
|
Py_LOCAL_INLINE(void) drop_groups(RE_State* state) {
|
|
if (state->pattern->true_group_count != 0)
|
|
state->current_saved_groups = state->current_saved_groups->previous;
|
|
}
|
|
|
|
/* Pushes the repeats for backtracking. */
|
|
Py_LOCAL_INLINE(BOOL) push_repeats(RE_SafeState* safe_state) {
|
|
RE_State* state;
|
|
PatternObject* pattern;
|
|
size_t repeat_count;
|
|
RE_SavedRepeats* current;
|
|
size_t r;
|
|
|
|
state = safe_state->re_state;
|
|
pattern = state->pattern;
|
|
|
|
repeat_count = pattern->repeat_count;
|
|
if (repeat_count == 0)
|
|
return TRUE;
|
|
|
|
current = state->current_saved_repeats;
|
|
|
|
if (current && current->next)
|
|
current = current->next;
|
|
else if (!current && state->first_saved_repeats)
|
|
current = state->first_saved_repeats;
|
|
else {
|
|
RE_SavedRepeats* new_block;
|
|
|
|
new_block = (RE_SavedRepeats*)safe_alloc(safe_state,
|
|
sizeof(RE_SavedRepeats));
|
|
if (!new_block)
|
|
return FALSE;
|
|
|
|
memset(new_block, 0, sizeof(RE_SavedRepeats));
|
|
|
|
new_block->repeats = (RE_RepeatData*)safe_alloc(safe_state,
|
|
repeat_count * sizeof(RE_RepeatData));
|
|
if (!new_block->repeats) {
|
|
safe_dealloc(safe_state, new_block);
|
|
return FALSE;
|
|
}
|
|
|
|
memset(new_block->repeats, 0, repeat_count * sizeof(RE_RepeatData));
|
|
|
|
new_block->previous = current;
|
|
new_block->next = NULL;
|
|
|
|
if (new_block->previous)
|
|
new_block->previous->next = new_block;
|
|
else
|
|
state->first_saved_repeats = new_block;
|
|
|
|
current = new_block;
|
|
}
|
|
|
|
for (r = 0; r < repeat_count; r++) {
|
|
if (!copy_repeat_data(safe_state, ¤t->repeats[r],
|
|
&state->repeats[r]))
|
|
return FALSE;
|
|
}
|
|
|
|
state->current_saved_repeats = current;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Pops the repeats for backtracking. */
|
|
Py_LOCAL_INLINE(void) pop_repeats(RE_State* state) {
|
|
PatternObject* pattern;
|
|
size_t repeat_count;
|
|
RE_SavedRepeats* current;
|
|
size_t r;
|
|
|
|
pattern = state->pattern;
|
|
|
|
repeat_count = pattern->repeat_count;
|
|
if (repeat_count == 0)
|
|
return;
|
|
|
|
current = state->current_saved_repeats;
|
|
|
|
for (r = 0; r < repeat_count; r++)
|
|
copy_repeat_data(NULL, &state->repeats[r], ¤t->repeats[r]);
|
|
|
|
state->current_saved_repeats = current->previous;
|
|
}
|
|
|
|
/* Saves state info before a recusive call by 'basic_match'. */
|
|
Py_LOCAL_INLINE(void) save_info(RE_State* state, RE_Info* info) {
|
|
info->backtrack_count = state->current_backtrack_block->count;
|
|
info->current_backtrack_block = state->current_backtrack_block;
|
|
info->current_saved_groups = state->current_saved_groups;
|
|
info->must_advance = state->must_advance;
|
|
info->current_group_call_frame = state->current_group_call_frame;
|
|
}
|
|
|
|
/* Restores state info after a recusive call by 'basic_match'. */
|
|
Py_LOCAL_INLINE(void) restore_info(RE_State* state, RE_Info* info) {
|
|
state->current_group_call_frame = info->current_group_call_frame;
|
|
state->must_advance = info->must_advance;
|
|
state->current_saved_groups = info->current_saved_groups;
|
|
state->current_backtrack_block = info->current_backtrack_block;
|
|
state->current_backtrack_block->count = info->backtrack_count;
|
|
}
|
|
|
|
/* Inserts a new span in a guard list. */
|
|
Py_LOCAL_INLINE(BOOL) insert_guard_span(RE_SafeState* safe_state, RE_GuardList*
|
|
guard_list, size_t index) {
|
|
size_t n;
|
|
|
|
if (guard_list->count >= guard_list->capacity) {
|
|
size_t new_capacity;
|
|
RE_GuardSpan* new_spans;
|
|
|
|
new_capacity = guard_list->capacity * 2;
|
|
if (new_capacity == 0)
|
|
new_capacity = RE_INIT_GUARDS_BLOCK_SIZE;
|
|
new_spans = (RE_GuardSpan*)safe_realloc(safe_state, guard_list->spans,
|
|
new_capacity * sizeof(RE_GuardSpan));
|
|
if (!new_spans)
|
|
return FALSE;
|
|
|
|
guard_list->capacity = new_capacity;
|
|
guard_list->spans = new_spans;
|
|
}
|
|
|
|
n = guard_list->count - index;
|
|
if (n > 0)
|
|
memmove(guard_list->spans + index + 1, guard_list->spans + index, n *
|
|
sizeof(RE_GuardSpan));
|
|
++guard_list->count;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Deletes a span in a guard list. */
|
|
Py_LOCAL_INLINE(void) delete_guard_span(RE_GuardList* guard_list, size_t index)
|
|
{
|
|
size_t n;
|
|
|
|
n = guard_list->count - index - 1;
|
|
if (n > 0)
|
|
memmove(guard_list->spans + index, guard_list->spans + index + 1, n *
|
|
sizeof(RE_GuardSpan));
|
|
--guard_list->count;
|
|
}
|
|
|
|
/* Checks whether a position is guarded against further matching. */
|
|
Py_LOCAL_INLINE(BOOL) is_guarded(RE_GuardList* guard_list, Py_ssize_t text_pos)
|
|
{
|
|
size_t low;
|
|
size_t high;
|
|
|
|
/* Is this position in the guard list? */
|
|
low = 0;
|
|
high = guard_list->count;
|
|
while (low < high) {
|
|
size_t mid;
|
|
RE_GuardSpan* span;
|
|
|
|
mid = (low + high) / 2;
|
|
span = &guard_list->spans[mid];
|
|
if (text_pos < span->low)
|
|
high = mid;
|
|
else if (text_pos > span->high)
|
|
low = mid + 1;
|
|
else
|
|
return span->protect;
|
|
}
|
|
|
|
guard_list->last_text_pos = text_pos;
|
|
guard_list->last_low = low;
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Guards a position against further matching. */
|
|
Py_LOCAL_INLINE(BOOL) guard(RE_SafeState* safe_state, RE_GuardList* guard_list,
|
|
Py_ssize_t text_pos, BOOL protect) {
|
|
size_t low;
|
|
size_t high;
|
|
|
|
/* Where should be new position be added? */
|
|
if (text_pos == guard_list->last_text_pos)
|
|
low = guard_list->last_low;
|
|
else {
|
|
low = 0;
|
|
high = guard_list->count;
|
|
while (low < high) {
|
|
size_t mid;
|
|
RE_GuardSpan* span;
|
|
|
|
mid = (low + high) / 2;
|
|
span = &guard_list->spans[mid];
|
|
if (text_pos < span->low)
|
|
high = mid;
|
|
else if (text_pos > span->high)
|
|
low = mid + 1;
|
|
else
|
|
return TRUE;
|
|
}
|
|
}
|
|
|
|
/* Add the position to the guard list. */
|
|
if (low > 0 && guard_list->spans[low - 1].high + 1 == text_pos &&
|
|
guard_list->spans[low - 1].protect == protect) {
|
|
/* The new position is just above this span. */
|
|
if (low < guard_list->count && guard_list->spans[low].low - 1 ==
|
|
text_pos && guard_list->spans[low].protect == protect) {
|
|
/* The new position joins 2 spans */
|
|
guard_list->spans[low - 1].high = guard_list->spans[low].high;
|
|
delete_guard_span(guard_list, low);
|
|
} else
|
|
/* Extend the span. */
|
|
guard_list->spans[low - 1].high = text_pos;
|
|
} else if (low < guard_list->count && guard_list->spans[low].low - 1 ==
|
|
text_pos && guard_list->spans[low].protect == protect)
|
|
/* The new position is just below this span. */
|
|
/* Extend the span. */
|
|
guard_list->spans[low].low = text_pos;
|
|
else {
|
|
/* Insert a new span. */
|
|
if (!insert_guard_span(safe_state, guard_list, low))
|
|
return FALSE;
|
|
guard_list->spans[low].low = text_pos;
|
|
guard_list->spans[low].high = text_pos;
|
|
guard_list->spans[low].protect = protect;
|
|
}
|
|
|
|
guard_list->last_text_pos = -1;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Guards a position against further matching for a repeat. */
|
|
Py_LOCAL_INLINE(BOOL) guard_repeat(RE_SafeState* safe_state, size_t index,
|
|
Py_ssize_t text_pos, RE_STATUS_T guard_type, BOOL protect) {
|
|
RE_State* state;
|
|
RE_GuardList* guard_list;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
/* Is a guard active here? */
|
|
if (!(state->pattern->repeat_info[index].status & guard_type))
|
|
return TRUE;
|
|
|
|
/* Which guard list? */
|
|
if (guard_type & RE_STATUS_BODY)
|
|
guard_list = &state->repeats[index].body_guard_list;
|
|
else
|
|
guard_list = &state->repeats[index].tail_guard_list;
|
|
|
|
return guard(safe_state, guard_list, text_pos, protect);
|
|
}
|
|
|
|
/* Checks whether a position is guarded against further matching for a repeat.
|
|
*/
|
|
Py_LOCAL_INLINE(BOOL) is_repeat_guarded(RE_SafeState* safe_state, size_t index,
|
|
Py_ssize_t text_pos, RE_STATUS_T guard_type) {
|
|
RE_State* state;
|
|
RE_GuardList* guard_list;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
/* Is a guard active here? */
|
|
if (!(state->pattern->repeat_info[index].status & guard_type))
|
|
return FALSE;
|
|
|
|
/* Which guard list? */
|
|
if (guard_type == RE_STATUS_BODY)
|
|
guard_list = &state->repeats[index].body_guard_list;
|
|
else
|
|
guard_list = &state->repeats[index].tail_guard_list;
|
|
|
|
return is_guarded(guard_list, text_pos);
|
|
}
|
|
|
|
/* Resets the guards inside atomic subpatterns and lookarounds. */
|
|
Py_LOCAL_INLINE(void) reset_guards(RE_State* state, RE_CODE* values) {
|
|
PatternObject* pattern;
|
|
size_t repeat_count;
|
|
|
|
pattern = state->pattern;
|
|
repeat_count = pattern->repeat_count;
|
|
|
|
if (values) {
|
|
size_t i;
|
|
|
|
for (i = 1; i <= values[0]; i++) {
|
|
size_t index;
|
|
|
|
index = values[i];
|
|
|
|
if (index < repeat_count) {
|
|
reset_guard_list(&state->repeats[index].body_guard_list);
|
|
reset_guard_list(&state->repeats[index].tail_guard_list);
|
|
} else {
|
|
index -= repeat_count;
|
|
|
|
reset_guard_list(&state->fuzzy_guards[index].body_guard_list);
|
|
reset_guard_list(&state->fuzzy_guards[index].tail_guard_list);
|
|
}
|
|
}
|
|
} else {
|
|
size_t index;
|
|
size_t fuzzy_count;
|
|
|
|
for (index = 0; index < repeat_count; index++) {
|
|
reset_guard_list(&state->repeats[index].body_guard_list);
|
|
reset_guard_list(&state->repeats[index].tail_guard_list);
|
|
}
|
|
|
|
fuzzy_count = pattern->fuzzy_count;
|
|
|
|
for (index = 0; index < fuzzy_count; index++) {
|
|
reset_guard_list(&state->fuzzy_guards[index].body_guard_list);
|
|
reset_guard_list(&state->fuzzy_guards[index].tail_guard_list);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Builds a Unicode string. */
|
|
Py_LOCAL_INLINE(PyObject*) build_unicode_value(void* buffer, Py_ssize_t len,
|
|
Py_ssize_t buffer_charsize) {
|
|
return PyUnicode_FromUnicode(buffer, len);
|
|
}
|
|
|
|
/* Builds a bytestring. Returns NULL if any member is too wide. */
|
|
Py_LOCAL_INLINE(PyObject*) build_bytes_value(void* buffer, Py_ssize_t len,
|
|
Py_ssize_t buffer_charsize)
|
|
{
|
|
Py_UCS1* byte_buffer;
|
|
Py_ssize_t i;
|
|
PyObject* result;
|
|
|
|
if (buffer_charsize == 1)
|
|
return Py_BuildValue("s#", buffer, len);
|
|
|
|
byte_buffer = re_alloc((size_t)len);
|
|
if (!byte_buffer)
|
|
return NULL;
|
|
|
|
for (i = 0; i < len; i++) {
|
|
Py_UCS2 c = ((Py_UCS2*)buffer)[i];
|
|
if (c > 0xFF)
|
|
goto too_wide;
|
|
|
|
byte_buffer[i] = (Py_UCS1)c;
|
|
}
|
|
|
|
result = Py_BuildValue("s#", byte_buffer, len);
|
|
|
|
re_dealloc(byte_buffer);
|
|
|
|
return result;
|
|
|
|
too_wide:
|
|
re_dealloc(byte_buffer);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Looks for a string in a string set. */
|
|
Py_LOCAL_INLINE(int) string_set_contains(RE_State* state, PyObject* string_set,
|
|
Py_ssize_t first, Py_ssize_t last) {
|
|
PyObject* string;
|
|
int status;
|
|
|
|
if (state->is_unicode)
|
|
string = build_unicode_value(state->point_to(state->text, first), last
|
|
- first, state->charsize);
|
|
else
|
|
string = build_bytes_value(state->point_to(state->text, first), last -
|
|
first, state->charsize);
|
|
if (!string)
|
|
return RE_ERROR_INTERNAL;
|
|
|
|
status = PySet_Contains(string_set, string);
|
|
Py_DECREF(string);
|
|
|
|
return status;
|
|
}
|
|
|
|
/* Looks for a string in a string set, ignoring case. */
|
|
Py_LOCAL_INLINE(int) string_set_contains_ign(RE_State* state, PyObject*
|
|
string_set, void* buffer, Py_ssize_t index, Py_ssize_t len, Py_ssize_t
|
|
buffer_charsize) {
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
|
|
RE_EncodingTable* encoding;
|
|
BOOL (*possible_turkic)(Py_UCS4 ch);
|
|
Py_UCS4 codepoints[4];
|
|
|
|
switch (buffer_charsize) {
|
|
case 1:
|
|
char_at = bytes1_char_at;
|
|
set_char_at = bytes1_set_char_at;
|
|
break;
|
|
case 2:
|
|
char_at = bytes2_char_at;
|
|
set_char_at = bytes2_set_char_at;
|
|
break;
|
|
case 4:
|
|
char_at = bytes4_char_at;
|
|
set_char_at = bytes4_set_char_at;
|
|
break;
|
|
default:
|
|
char_at = bytes1_char_at;
|
|
set_char_at = bytes1_set_char_at;
|
|
break;
|
|
}
|
|
|
|
encoding = state->encoding;
|
|
possible_turkic = encoding->possible_turkic;
|
|
|
|
/* Look for a possible Turkic 'I'. */
|
|
while (index < len && !possible_turkic(char_at(buffer, index)))
|
|
++index;
|
|
|
|
if (index < len) {
|
|
/* Possible Turkic 'I'. */
|
|
int count;
|
|
int i;
|
|
|
|
/* Try all the alternatives to the 'I'. */
|
|
count = encoding->all_turkic_i(char_at(buffer, index), codepoints);
|
|
|
|
for (i = 0; i < count; i++) {
|
|
int status;
|
|
|
|
set_char_at(buffer, index, codepoints[i]);
|
|
|
|
/* Recurse for the remainder of the string. */
|
|
status = string_set_contains_ign(state, string_set, buffer, index +
|
|
1, len, buffer_charsize);
|
|
if (status != 0)
|
|
return status;
|
|
}
|
|
|
|
return 0;
|
|
} else {
|
|
/* No Turkic 'I'. */
|
|
PyObject* string;
|
|
int status;
|
|
|
|
if (state->is_unicode)
|
|
string = build_unicode_value(buffer, len, buffer_charsize);
|
|
else
|
|
string = build_bytes_value(buffer, len, buffer_charsize);
|
|
if (!string)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
status = PySet_Contains(string_set, string);
|
|
Py_DECREF(string);
|
|
|
|
return status;
|
|
}
|
|
}
|
|
|
|
/* Creates a partial string set for truncation at the left or right side. */
|
|
Py_LOCAL_INLINE(int) make_partial_string_set(RE_State* state, RE_Node* node) {
|
|
PatternObject* pattern;
|
|
int partial_side;
|
|
PyObject* string_set;
|
|
PyObject* partial_set;
|
|
PyObject* iter = NULL;
|
|
PyObject* item = NULL;
|
|
PyObject* slice = NULL;
|
|
|
|
pattern = state->pattern;
|
|
partial_side = state->partial_side;
|
|
if (partial_side != RE_PARTIAL_LEFT && partial_side != RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_INTERNAL;
|
|
|
|
/* Fetch the full string set. PyList_GET_ITEM borrows a reference. */
|
|
string_set = PyList_GET_ITEM(pattern->named_list_indexes, node->values[0]);
|
|
if (!string_set)
|
|
return RE_ERROR_INTERNAL;
|
|
|
|
/* Gets the list of partial string sets. */
|
|
if (!pattern->partial_named_lists[partial_side]) {
|
|
size_t size;
|
|
|
|
size = pattern->named_lists_count * sizeof(PyObject*);
|
|
pattern->partial_named_lists[partial_side] = re_alloc(size);
|
|
if (!pattern->partial_named_lists[partial_side])
|
|
return RE_ERROR_INTERNAL;
|
|
|
|
memset(pattern->partial_named_lists[partial_side], 0, size);
|
|
}
|
|
|
|
/* Get the partial string set. */
|
|
partial_set = pattern->partial_named_lists[partial_side][node->values[0]];
|
|
if (partial_set)
|
|
return 1;
|
|
|
|
/* Build the partial string set. */
|
|
partial_set = PySet_New(NULL);
|
|
if (!partial_set)
|
|
return RE_ERROR_INTERNAL;
|
|
|
|
iter = PyObject_GetIter(string_set);
|
|
if (!iter)
|
|
goto error;
|
|
|
|
item = PyIter_Next(iter);
|
|
|
|
while (item) {
|
|
Py_ssize_t len;
|
|
Py_ssize_t first;
|
|
Py_ssize_t last;
|
|
|
|
len = PySequence_Length(item);
|
|
if (len == -1)
|
|
goto error;
|
|
|
|
first = 0;
|
|
last = len;
|
|
|
|
while (last - first > 1) {
|
|
int status;
|
|
|
|
/* Shorten the entry. */
|
|
if (partial_side == RE_PARTIAL_LEFT)
|
|
++first;
|
|
else
|
|
--last;
|
|
|
|
slice = PySequence_GetSlice(item, first, last);
|
|
if (!slice)
|
|
goto error;
|
|
|
|
status = PySet_Add(partial_set, slice);
|
|
Py_DECREF(slice);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
|
|
Py_DECREF(item);
|
|
item = PyIter_Next(iter);
|
|
}
|
|
|
|
if (PyErr_Occurred())
|
|
goto error;
|
|
|
|
Py_DECREF(iter);
|
|
|
|
pattern->partial_named_lists[partial_side][node->values[0]] = partial_set;
|
|
|
|
return 1;
|
|
|
|
error:
|
|
Py_XDECREF(item);
|
|
Py_XDECREF(iter);
|
|
Py_DECREF(partial_set);
|
|
|
|
return RE_ERROR_INTERNAL;
|
|
}
|
|
|
|
/* Tries to match a string at the current position with a member of a string
|
|
* set, forwards or backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) string_set_match_fwdrev(RE_SafeState* safe_state, RE_Node*
|
|
node, BOOL reverse) {
|
|
RE_State* state;
|
|
Py_ssize_t min_len;
|
|
Py_ssize_t max_len;
|
|
Py_ssize_t text_available;
|
|
Py_ssize_t slice_available;
|
|
int partial_side;
|
|
Py_ssize_t len;
|
|
Py_ssize_t first;
|
|
Py_ssize_t last;
|
|
int status;
|
|
PyObject* string_set;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
min_len = (Py_ssize_t)node->values[1];
|
|
max_len = (Py_ssize_t)node->values[2];
|
|
|
|
acquire_GIL(safe_state);
|
|
|
|
if (reverse) {
|
|
text_available = state->text_pos;
|
|
slice_available = state->text_pos - state->slice_start;
|
|
partial_side = RE_PARTIAL_LEFT;
|
|
} else {
|
|
text_available = state->text_length - state->text_pos;
|
|
slice_available = state->slice_end - state->text_pos;
|
|
partial_side = RE_PARTIAL_RIGHT;
|
|
}
|
|
|
|
/* Get as many characters as we need for the longest possible match. */
|
|
len = min_ssize_t(max_len, slice_available);
|
|
|
|
if (reverse) {
|
|
first = state->text_pos - len;
|
|
last = state->text_pos;
|
|
} else {
|
|
first = state->text_pos;
|
|
last = state->text_pos + len;
|
|
}
|
|
|
|
/* If we didn't get all of the characters we need, is a partial match
|
|
* allowed?
|
|
*/
|
|
if (len < max_len && len == text_available && state->partial_side ==
|
|
partial_side) {
|
|
if (len == 0) {
|
|
/* An empty string is always a possible partial match. */
|
|
status = RE_ERROR_PARTIAL;
|
|
goto finished;
|
|
}
|
|
|
|
/* Make a set of the possible partial matches. */
|
|
status = make_partial_string_set(state, node);
|
|
if (status < 0)
|
|
goto finished;
|
|
|
|
/* Fetch the partial string set. */
|
|
string_set =
|
|
state->pattern->partial_named_lists[partial_side][node->values[0]];
|
|
|
|
/* Is the text we have a partial match? */
|
|
status = string_set_contains(state, string_set, first, last);
|
|
if (status < 0)
|
|
goto finished;
|
|
|
|
if (status == 1) {
|
|
/* Advance past the match. */
|
|
if (reverse)
|
|
state->text_pos -= len;
|
|
else
|
|
state->text_pos += len;
|
|
|
|
status = RE_ERROR_PARTIAL;
|
|
goto finished;
|
|
}
|
|
}
|
|
|
|
/* Fetch the string set. PyList_GET_ITEM borrows a reference. */
|
|
string_set = PyList_GET_ITEM(state->pattern->named_list_indexes,
|
|
node->values[0]);
|
|
if (!string_set) {
|
|
status = RE_ERROR_INTERNAL;
|
|
goto finished;
|
|
}
|
|
|
|
/* We've already looked for a partial match (if allowed), but what about a
|
|
* complete match?
|
|
*/
|
|
while (len >= min_len) {
|
|
status = string_set_contains(state, string_set, first, last);
|
|
|
|
if (status == 1) {
|
|
/* Advance past the match. */
|
|
if (reverse)
|
|
state->text_pos -= len;
|
|
else
|
|
state->text_pos += len;
|
|
|
|
status = 1;
|
|
goto finished;
|
|
}
|
|
|
|
/* Look for a shorter match. */
|
|
--len;
|
|
if (reverse)
|
|
++first;
|
|
else
|
|
--last;
|
|
}
|
|
|
|
/* No match. */
|
|
status = 0;
|
|
|
|
finished:
|
|
release_GIL(safe_state);
|
|
|
|
return status;
|
|
}
|
|
|
|
/* Tries to match a string at the current position with a member of a string
|
|
* set, ignoring case, forwards or backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) string_set_match_fld_fwdrev(RE_SafeState* safe_state,
|
|
RE_Node* node, BOOL reverse) {
|
|
RE_State* state;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
Py_ssize_t folded_charsize;
|
|
void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
|
|
Py_ssize_t min_len;
|
|
Py_ssize_t max_len;
|
|
Py_ssize_t buf_len;
|
|
void* folded;
|
|
int status;
|
|
BOOL* end_of_fold = NULL;
|
|
Py_ssize_t text_available;
|
|
Py_ssize_t slice_available;
|
|
Py_ssize_t t_pos;
|
|
Py_ssize_t f_pos;
|
|
int step;
|
|
int partial_side;
|
|
Py_ssize_t len;
|
|
Py_ssize_t consumed;
|
|
Py_UCS4 codepoints[RE_MAX_FOLDED];
|
|
PyObject* string_set;
|
|
Py_ssize_t first;
|
|
Py_ssize_t last;
|
|
|
|
state = safe_state->re_state;
|
|
full_case_fold = state->encoding->full_case_fold;
|
|
char_at = state->char_at;
|
|
|
|
/* The folded string will have the same width as the original string. */
|
|
folded_charsize = state->charsize;
|
|
|
|
switch (folded_charsize) {
|
|
case 1:
|
|
set_char_at = bytes1_set_char_at;
|
|
break;
|
|
case 2:
|
|
set_char_at = bytes2_set_char_at;
|
|
break;
|
|
case 4:
|
|
set_char_at = bytes4_set_char_at;
|
|
break;
|
|
default:
|
|
return RE_ERROR_INTERNAL;
|
|
}
|
|
|
|
min_len = (Py_ssize_t)node->values[1];
|
|
max_len = (Py_ssize_t)node->values[2];
|
|
|
|
acquire_GIL(safe_state);
|
|
|
|
/* Allocate a buffer for the folded string. */
|
|
buf_len = max_len + RE_MAX_FOLDED;
|
|
folded = re_alloc((size_t)(buf_len * folded_charsize));
|
|
if (!folded) {
|
|
status = RE_ERROR_MEMORY;
|
|
goto finished;
|
|
}
|
|
|
|
end_of_fold = re_alloc((size_t)buf_len * sizeof(BOOL));
|
|
if (!end_of_fold) {
|
|
status = RE_ERROR_MEMORY;
|
|
goto finished;
|
|
}
|
|
|
|
memset(end_of_fold, 0, (size_t)buf_len * sizeof(BOOL));
|
|
|
|
if (reverse) {
|
|
text_available = state->text_pos;
|
|
slice_available = state->text_pos - state->slice_start;
|
|
t_pos = state->text_pos - 1;
|
|
f_pos = buf_len;
|
|
step = -1;
|
|
partial_side = RE_PARTIAL_LEFT;
|
|
} else {
|
|
text_available = state->text_length - state->text_pos;
|
|
slice_available = state->slice_end - state->text_pos;
|
|
t_pos = state->text_pos;
|
|
f_pos = 0;
|
|
step = 1;
|
|
partial_side = RE_PARTIAL_RIGHT;
|
|
}
|
|
|
|
/* We can stop getting characters as soon as the case-folded string is long
|
|
* enough (each codepoint from the text can expand to more than one folded
|
|
* codepoint).
|
|
*/
|
|
len = 0;
|
|
end_of_fold[len] = TRUE;
|
|
|
|
consumed = 0;
|
|
while (len < max_len && consumed < slice_available) {
|
|
int count;
|
|
int j;
|
|
|
|
count = full_case_fold(char_at(state->text, t_pos), codepoints);
|
|
|
|
if (reverse)
|
|
f_pos -= count;
|
|
|
|
for (j = 0; j < count; j++)
|
|
set_char_at(folded, f_pos + j, codepoints[j]);
|
|
|
|
if (!reverse)
|
|
f_pos += count;
|
|
|
|
len += count;
|
|
end_of_fold[len] = TRUE;
|
|
++consumed;
|
|
t_pos += step;
|
|
}
|
|
|
|
if (reverse) {
|
|
first = f_pos;
|
|
last = buf_len;
|
|
} else {
|
|
first = 0;
|
|
last = f_pos;
|
|
}
|
|
|
|
/* If we didn't get all of the characters we need, is a partial match
|
|
* allowed?
|
|
*/
|
|
if (len < max_len && len == text_available && state->partial_side ==
|
|
partial_side) {
|
|
if (len == 0) {
|
|
/* An empty string is always a possible partial match. */
|
|
status = RE_ERROR_PARTIAL;
|
|
goto finished;
|
|
}
|
|
|
|
/* Make a set of the possible partial matches. */
|
|
status = make_partial_string_set(state, node);
|
|
if (status < 0)
|
|
goto finished;
|
|
|
|
/* Fetch the partial string set. */
|
|
string_set =
|
|
state->pattern->partial_named_lists[partial_side][node->values[0]];
|
|
|
|
/* Is the text we have a partial match? */
|
|
status = string_set_contains_ign(state, string_set, folded, first,
|
|
last, folded_charsize);
|
|
if (status < 0)
|
|
goto finished;
|
|
|
|
if (status == 1) {
|
|
/* Advance past the match. */
|
|
if (reverse)
|
|
state->text_pos -= consumed;
|
|
else
|
|
state->text_pos += consumed;
|
|
|
|
status = RE_ERROR_PARTIAL;
|
|
goto finished;
|
|
}
|
|
}
|
|
|
|
/* Fetch the string set. PyList_GET_ITEM borrows a reference. */
|
|
string_set = PyList_GET_ITEM(state->pattern->named_list_indexes,
|
|
node->values[0]);
|
|
if (!string_set) {
|
|
status = RE_ERROR_INTERNAL;
|
|
goto finished;
|
|
}
|
|
|
|
/* We've already looked for a partial match (if allowed), but what about a
|
|
* complete match?
|
|
*/
|
|
while (len >= min_len) {
|
|
if (end_of_fold[len]) {
|
|
status = string_set_contains_ign(state, string_set, folded, first,
|
|
last, folded_charsize);
|
|
|
|
if (status == 1) {
|
|
/* Advance past the match. */
|
|
if (reverse)
|
|
state->text_pos -= consumed;
|
|
else
|
|
state->text_pos += consumed;
|
|
|
|
status = 1;
|
|
goto finished;
|
|
}
|
|
|
|
--consumed;
|
|
}
|
|
|
|
/* Look for a shorter match. */
|
|
--len;
|
|
if (reverse)
|
|
++first;
|
|
else
|
|
--last;
|
|
}
|
|
|
|
/* No match. */
|
|
status = 0;
|
|
|
|
finished:
|
|
re_dealloc(end_of_fold);
|
|
re_dealloc(folded);
|
|
|
|
release_GIL(safe_state);
|
|
|
|
return status;
|
|
}
|
|
|
|
/* Tries to match a string at the current position with a member of a string
|
|
* set, ignoring case, forwards or backwards.
|
|
*/
|
|
Py_LOCAL_INLINE(int) string_set_match_ign_fwdrev(RE_SafeState* safe_state,
|
|
RE_Node* node, BOOL reverse) {
|
|
RE_State* state;
|
|
Py_UCS4 (*simple_case_fold)(Py_UCS4 ch);
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
Py_ssize_t folded_charsize;
|
|
void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
|
|
Py_ssize_t min_len;
|
|
Py_ssize_t max_len;
|
|
void* folded;
|
|
int status;
|
|
Py_ssize_t text_available;
|
|
Py_ssize_t slice_available;
|
|
Py_ssize_t t_pos;
|
|
Py_ssize_t f_pos;
|
|
int step;
|
|
int partial_side;
|
|
Py_ssize_t len;
|
|
Py_ssize_t i;
|
|
Py_ssize_t first;
|
|
Py_ssize_t last;
|
|
PyObject* string_set;
|
|
|
|
state = safe_state->re_state;
|
|
simple_case_fold = state->encoding->simple_case_fold;
|
|
char_at = state->char_at;
|
|
|
|
/* The folded string will have the same width as the original string. */
|
|
folded_charsize = state->charsize;
|
|
|
|
switch (folded_charsize) {
|
|
case 1:
|
|
set_char_at = bytes1_set_char_at;
|
|
break;
|
|
case 2:
|
|
set_char_at = bytes2_set_char_at;
|
|
break;
|
|
case 4:
|
|
set_char_at = bytes4_set_char_at;
|
|
break;
|
|
default:
|
|
return RE_ERROR_INTERNAL;
|
|
}
|
|
|
|
min_len = (Py_ssize_t)node->values[1];
|
|
max_len = (Py_ssize_t)node->values[2];
|
|
|
|
acquire_GIL(safe_state);
|
|
|
|
/* Allocate a buffer for the folded string. */
|
|
folded = re_alloc((size_t)(max_len * folded_charsize));
|
|
if (!folded) {
|
|
status = RE_ERROR_MEMORY;
|
|
goto finished;
|
|
}
|
|
|
|
if (reverse) {
|
|
text_available = state->text_pos;
|
|
slice_available = state->text_pos - state->slice_start;
|
|
t_pos = state->text_pos - 1;
|
|
f_pos = max_len - 1;
|
|
step = -1;
|
|
partial_side = RE_PARTIAL_LEFT;
|
|
} else {
|
|
text_available = state->text_length - state->text_pos;
|
|
slice_available = state->slice_end - state->text_pos;
|
|
t_pos = state->text_pos;
|
|
f_pos = 0;
|
|
step = 1;
|
|
partial_side = RE_PARTIAL_RIGHT;
|
|
}
|
|
|
|
/* Get as many characters as we need for the longest possible match. */
|
|
len = min_ssize_t(max_len, slice_available);
|
|
|
|
for (i = 0; i < len; i ++) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = simple_case_fold(char_at(state->text, t_pos));
|
|
set_char_at(folded, f_pos, ch);
|
|
t_pos += step;
|
|
f_pos += step;
|
|
}
|
|
|
|
if (reverse) {
|
|
first = f_pos;
|
|
last = max_len;
|
|
} else {
|
|
first = 0;
|
|
last = f_pos;
|
|
}
|
|
|
|
/* If we didn't get all of the characters we need, is a partial match
|
|
* allowed?
|
|
*/
|
|
if (len < max_len && len == text_available && state->partial_side ==
|
|
partial_side) {
|
|
if (len == 0) {
|
|
/* An empty string is always a possible partial match. */
|
|
status = RE_ERROR_PARTIAL;
|
|
goto finished;
|
|
}
|
|
|
|
/* Make a set of the possible partial matches. */
|
|
status = make_partial_string_set(state, node);
|
|
if (status < 0)
|
|
goto finished;
|
|
|
|
/* Fetch the partial string set. */
|
|
string_set =
|
|
state->pattern->partial_named_lists[partial_side][node->values[0]];
|
|
|
|
/* Is the text we have a partial match? */
|
|
status = string_set_contains_ign(state, string_set, folded, first,
|
|
last, folded_charsize);
|
|
if (status < 0)
|
|
goto finished;
|
|
|
|
if (status == 1) {
|
|
/* Advance past the match. */
|
|
if (reverse)
|
|
state->text_pos -= len;
|
|
else
|
|
state->text_pos += len;
|
|
|
|
status = RE_ERROR_PARTIAL;
|
|
goto finished;
|
|
}
|
|
}
|
|
|
|
/* Fetch the string set. PyList_GET_ITEM borrows a reference. */
|
|
string_set = PyList_GET_ITEM(state->pattern->named_list_indexes,
|
|
node->values[0]);
|
|
if (!string_set) {
|
|
status = RE_ERROR_INTERNAL;
|
|
goto finished;
|
|
}
|
|
|
|
/* We've already looked for a partial match (if allowed), but what about a
|
|
* complete match?
|
|
*/
|
|
while (len >= min_len) {
|
|
status = string_set_contains_ign(state, string_set, folded, first,
|
|
last, folded_charsize);
|
|
|
|
if (status == 1) {
|
|
/* Advance past the match. */
|
|
if (reverse)
|
|
state->text_pos -= len;
|
|
else
|
|
state->text_pos += len;
|
|
|
|
status = 1;
|
|
goto finished;
|
|
}
|
|
|
|
/* Look for a shorter match. */
|
|
--len;
|
|
if (reverse)
|
|
++first;
|
|
else
|
|
--last;
|
|
}
|
|
|
|
/* No match. */
|
|
status = 0;
|
|
|
|
finished:
|
|
re_dealloc(folded);
|
|
|
|
release_GIL(safe_state);
|
|
|
|
return status;
|
|
}
|
|
|
|
/* Checks whether any additional fuzzy error is permitted. */
|
|
Py_LOCAL_INLINE(BOOL) any_error_permitted(RE_State* state) {
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
return fuzzy_info->total_cost <= values[RE_FUZZY_VAL_MAX_COST] &&
|
|
fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MAX_ERR] &&
|
|
state->total_cost <= state->max_cost;
|
|
}
|
|
|
|
/* Checks whether this additional fuzzy error is permitted. */
|
|
Py_LOCAL_INLINE(BOOL) this_error_permitted(RE_State* state, int fuzzy_type) {
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
return fuzzy_info->total_cost + values[RE_FUZZY_VAL_COST_BASE + fuzzy_type]
|
|
<= values[RE_FUZZY_VAL_MAX_COST] && fuzzy_info->counts[fuzzy_type] <
|
|
values[RE_FUZZY_VAL_MAX_BASE + fuzzy_type] && state->total_cost +
|
|
values[RE_FUZZY_VAL_COST_BASE + fuzzy_type] <= state->max_cost;
|
|
}
|
|
|
|
/* Checks whether we've reachsd the end of the text during a fuzzy partial
|
|
* match.
|
|
*/
|
|
Py_LOCAL_INLINE(int) check_fuzzy_partial(RE_State* state, Py_ssize_t text_pos)
|
|
{
|
|
switch (state->partial_side) {
|
|
case RE_PARTIAL_LEFT:
|
|
if (text_pos < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
break;
|
|
case RE_PARTIAL_RIGHT:
|
|
if (text_pos > state->text_length)
|
|
return RE_ERROR_PARTIAL;
|
|
break;
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
/* Checks a fuzzy match of an item. */
|
|
Py_LOCAL_INLINE(int) next_fuzzy_match_item(RE_State* state, RE_FuzzyData* data,
|
|
BOOL is_string, int step) {
|
|
Py_ssize_t new_pos;
|
|
|
|
if (this_error_permitted(state, data->fuzzy_type)) {
|
|
switch (data->fuzzy_type) {
|
|
case RE_FUZZY_DEL:
|
|
/* Could a character at text_pos have been deleted? */
|
|
if (is_string)
|
|
data->new_string_pos += step;
|
|
else
|
|
data->new_node = data->new_node->next_1.node;
|
|
return RE_ERROR_SUCCESS;
|
|
case RE_FUZZY_INS:
|
|
/* Could the character at text_pos have been inserted? */
|
|
if (!data->permit_insertion)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
new_pos = data->new_text_pos + step;
|
|
if (state->slice_start <= new_pos && new_pos <= state->slice_end) {
|
|
data->new_text_pos = new_pos;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
return check_fuzzy_partial(state, new_pos);
|
|
case RE_FUZZY_SUB:
|
|
/* Could the character at text_pos have been substituted? */
|
|
new_pos = data->new_text_pos + step;
|
|
if (state->slice_start <= new_pos && new_pos <= state->slice_end) {
|
|
data->new_text_pos = new_pos;
|
|
if (is_string)
|
|
data->new_string_pos += step;
|
|
else
|
|
data->new_node = data->new_node->next_1.node;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
return check_fuzzy_partial(state, new_pos);
|
|
}
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
/* Tries a fuzzy match of an item of width 0 or 1. */
|
|
Py_LOCAL_INLINE(int) fuzzy_match_item(RE_SafeState* safe_state, BOOL search,
|
|
Py_ssize_t* text_pos, RE_Node** node, int step) {
|
|
RE_State* state;
|
|
RE_FuzzyData data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
RE_BacktrackData* bt_data;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
if (!any_error_permitted(state)) {
|
|
*node = NULL;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
data.new_text_pos = *text_pos;
|
|
data.new_node = *node;
|
|
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
if (step == 0) {
|
|
if (data.new_node->status & RE_STATUS_REVERSE) {
|
|
data.step = -1;
|
|
data.limit = state->slice_start;
|
|
} else {
|
|
data.step = 1;
|
|
data.limit = state->slice_end;
|
|
}
|
|
} else
|
|
data.step = step;
|
|
|
|
/* Permit insertion except initially when searching (it's better just to
|
|
* start searching one character later).
|
|
*/
|
|
data.permit_insertion = !search || data.new_text_pos !=
|
|
state->search_anchor;
|
|
|
|
for (data.fuzzy_type = 0; data.fuzzy_type < RE_FUZZY_COUNT;
|
|
data.fuzzy_type++) {
|
|
int status;
|
|
|
|
status = next_fuzzy_match_item(state, &data, FALSE, step);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
goto found;
|
|
}
|
|
|
|
*node = NULL;
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
found:
|
|
if (!add_backtrack(safe_state, (*node)->op))
|
|
return RE_ERROR_FAILURE;
|
|
bt_data = state->backtrack;
|
|
bt_data->fuzzy_item.position.text_pos = *text_pos;
|
|
bt_data->fuzzy_item.position.node = *node;
|
|
bt_data->fuzzy_item.fuzzy_type = (RE_INT8)data.fuzzy_type;
|
|
bt_data->fuzzy_item.step = (RE_INT8)step;
|
|
|
|
++fuzzy_info->counts[data.fuzzy_type];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
*text_pos = data.new_text_pos;
|
|
*node = data.new_node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Retries a fuzzy match of a item of width 0 or 1. */
|
|
Py_LOCAL_INLINE(int) retry_fuzzy_match_item(RE_SafeState* safe_state, BOOL
|
|
search, Py_ssize_t* text_pos, RE_Node** node, BOOL advance) {
|
|
RE_State* state;
|
|
RE_FuzzyData data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
RE_BacktrackData* bt_data;
|
|
int step;
|
|
|
|
state = safe_state->re_state;
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
bt_data = state->backtrack;
|
|
data.new_text_pos = bt_data->fuzzy_item.position.text_pos;
|
|
data.new_node = bt_data->fuzzy_item.position.node;
|
|
data.fuzzy_type = bt_data->fuzzy_item.fuzzy_type;
|
|
data.step = bt_data->fuzzy_item.step;
|
|
|
|
if (data.fuzzy_type >= 0) {
|
|
--fuzzy_info->counts[data.fuzzy_type];
|
|
--fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost -= values[RE_FUZZY_VAL_COST_BASE +
|
|
data.fuzzy_type];
|
|
--state->total_errors;
|
|
state->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
}
|
|
|
|
/* Permit insertion except initially when searching (it's better just to
|
|
* start searching one character later).
|
|
*/
|
|
data.permit_insertion = !search || data.new_text_pos !=
|
|
state->search_anchor;
|
|
|
|
step = advance ? data.step : 0;
|
|
|
|
for (++data.fuzzy_type; data.fuzzy_type < RE_FUZZY_COUNT;
|
|
data.fuzzy_type++) {
|
|
int status;
|
|
|
|
status = next_fuzzy_match_item(state, &data, FALSE, step);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
goto found;
|
|
}
|
|
|
|
discard_backtrack(state);
|
|
*node = NULL;
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
found:
|
|
bt_data->fuzzy_item.fuzzy_type = (RE_INT8)data.fuzzy_type;
|
|
|
|
++fuzzy_info->counts[data.fuzzy_type];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
*text_pos = data.new_text_pos;
|
|
*node = data.new_node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Tries a fuzzy insertion. */
|
|
Py_LOCAL_INLINE(int) fuzzy_insert(RE_SafeState* safe_state, Py_ssize_t
|
|
text_pos, RE_Node* node) {
|
|
RE_State* state;
|
|
RE_BacktrackData* bt_data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
/* No insertion or deletion. */
|
|
if (!add_backtrack(safe_state, node->op))
|
|
return RE_ERROR_FAILURE;
|
|
bt_data = state->backtrack;
|
|
bt_data->fuzzy_insert.position.text_pos = text_pos;
|
|
bt_data->fuzzy_insert.position.node = node;
|
|
bt_data->fuzzy_insert.count = 0;
|
|
bt_data->fuzzy_insert.too_few_errors = state->too_few_errors;
|
|
bt_data->fuzzy_insert.fuzzy_node = node; /* END_FUZZY node. */
|
|
|
|
/* Check whether there are too few errors. */
|
|
fuzzy_info = &state->fuzzy_info;
|
|
|
|
/* The node in this case is the END_FUZZY node. */
|
|
values = node->values;
|
|
|
|
if (fuzzy_info->counts[RE_FUZZY_DEL] < values[RE_FUZZY_VAL_MIN_DEL] ||
|
|
fuzzy_info->counts[RE_FUZZY_INS] < values[RE_FUZZY_VAL_MIN_INS] ||
|
|
fuzzy_info->counts[RE_FUZZY_SUB] < values[RE_FUZZY_VAL_MIN_SUB] ||
|
|
fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MIN_ERR])
|
|
state->too_few_errors = RE_ERROR_SUCCESS;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Retries a fuzzy insertion. */
|
|
Py_LOCAL_INLINE(int) retry_fuzzy_insert(RE_SafeState* safe_state, Py_ssize_t*
|
|
text_pos, RE_Node** node) {
|
|
RE_State* state;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_BacktrackData* bt_data;
|
|
Py_ssize_t new_text_pos;
|
|
RE_Node* new_node;
|
|
int step;
|
|
Py_ssize_t limit;
|
|
RE_Node* fuzzy_node;
|
|
RE_CODE* values;
|
|
|
|
state = safe_state->re_state;
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
bt_data = state->backtrack;
|
|
new_text_pos = bt_data->fuzzy_insert.position.text_pos;
|
|
new_node = bt_data->fuzzy_insert.position.node;
|
|
|
|
if (new_node->status & RE_STATUS_REVERSE) {
|
|
step = -1;
|
|
limit = state->slice_start;
|
|
} else {
|
|
step = 1;
|
|
limit = state->slice_end;
|
|
}
|
|
|
|
/* Could the character at text_pos have been inserted? */
|
|
if (!this_error_permitted(state, RE_FUZZY_INS) || new_text_pos == limit) {
|
|
size_t count;
|
|
|
|
count = bt_data->fuzzy_insert.count;
|
|
|
|
fuzzy_info->counts[RE_FUZZY_INS] -= count;
|
|
fuzzy_info->counts[RE_FUZZY_ERR] -= count;
|
|
fuzzy_info->total_cost -= values[RE_FUZZY_VAL_INS_COST] * count;
|
|
state->total_errors -= count;
|
|
state->total_cost -= values[RE_FUZZY_VAL_INS_COST] * count;
|
|
state->too_few_errors = bt_data->fuzzy_insert.too_few_errors;
|
|
|
|
discard_backtrack(state);
|
|
*node = NULL;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
++bt_data->fuzzy_insert.count;
|
|
|
|
++fuzzy_info->counts[RE_FUZZY_INS];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_INS_COST];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_INS_COST];
|
|
|
|
/* Check whether there are too few errors. */
|
|
state->too_few_errors = bt_data->fuzzy_insert.too_few_errors;
|
|
fuzzy_node = bt_data->fuzzy_insert.fuzzy_node; /* END_FUZZY node. */
|
|
values = fuzzy_node->values;
|
|
if (fuzzy_info->counts[RE_FUZZY_DEL] < values[RE_FUZZY_VAL_MIN_DEL] ||
|
|
fuzzy_info->counts[RE_FUZZY_INS] < values[RE_FUZZY_VAL_MIN_INS] ||
|
|
fuzzy_info->counts[RE_FUZZY_SUB] < values[RE_FUZZY_VAL_MIN_SUB] ||
|
|
fuzzy_info->counts[RE_FUZZY_ERR] < values[RE_FUZZY_VAL_MIN_ERR])
|
|
state->too_few_errors = RE_ERROR_SUCCESS;
|
|
|
|
*text_pos = new_text_pos + step * (Py_ssize_t)bt_data->fuzzy_insert.count;
|
|
*node = new_node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Tries a fuzzy match of a string. */
|
|
Py_LOCAL_INLINE(int) fuzzy_match_string(RE_SafeState* safe_state, BOOL search,
|
|
Py_ssize_t* text_pos, RE_Node* node, Py_ssize_t* string_pos, BOOL* matched,
|
|
int step) {
|
|
RE_State* state;
|
|
RE_FuzzyData data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
RE_BacktrackData* bt_data;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
if (!any_error_permitted(state)) {
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
data.new_text_pos = *text_pos;
|
|
data.new_string_pos = *string_pos;
|
|
data.step = step;
|
|
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
/* Permit insertion except initially when searching (it's better just to
|
|
* start searching one character later).
|
|
*/
|
|
data.permit_insertion = !search || data.new_text_pos !=
|
|
state->search_anchor;
|
|
|
|
for (data.fuzzy_type = 0; data.fuzzy_type < RE_FUZZY_COUNT;
|
|
data.fuzzy_type++) {
|
|
int status;
|
|
|
|
status = next_fuzzy_match_item(state, &data, TRUE, data.step);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
goto found;
|
|
}
|
|
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
found:
|
|
if (!add_backtrack(safe_state, node->op))
|
|
return RE_ERROR_FAILURE;
|
|
bt_data = state->backtrack;
|
|
bt_data->fuzzy_string.position.text_pos = *text_pos;
|
|
bt_data->fuzzy_string.position.node = node;
|
|
bt_data->fuzzy_string.string_pos = *string_pos;
|
|
bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
|
|
bt_data->fuzzy_string.step = (RE_INT8)step;
|
|
|
|
++fuzzy_info->counts[data.fuzzy_type];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
*text_pos = data.new_text_pos;
|
|
*string_pos = data.new_string_pos;
|
|
*matched = TRUE;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Retries a fuzzy match of a string. */
|
|
Py_LOCAL_INLINE(int) retry_fuzzy_match_string(RE_SafeState* safe_state, BOOL
|
|
search, Py_ssize_t* text_pos, RE_Node** node, Py_ssize_t* string_pos, BOOL*
|
|
matched) {
|
|
RE_State* state;
|
|
RE_FuzzyData data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
RE_BacktrackData* bt_data;
|
|
RE_Node* new_node;
|
|
|
|
state = safe_state->re_state;
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
bt_data = state->backtrack;
|
|
data.new_text_pos = bt_data->fuzzy_string.position.text_pos;
|
|
new_node = bt_data->fuzzy_string.position.node;
|
|
data.new_string_pos = bt_data->fuzzy_string.string_pos;
|
|
data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type;
|
|
data.step = bt_data->fuzzy_string.step;
|
|
|
|
--fuzzy_info->counts[data.fuzzy_type];
|
|
--fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
--state->total_errors;
|
|
state->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
/* Permit insertion except initially when searching (it's better just to
|
|
* start searching one character later).
|
|
*/
|
|
data.permit_insertion = !search || data.new_text_pos !=
|
|
state->search_anchor;
|
|
|
|
for (++data.fuzzy_type; data.fuzzy_type < RE_FUZZY_COUNT;
|
|
data.fuzzy_type++) {
|
|
int status;
|
|
|
|
status = next_fuzzy_match_item(state, &data, TRUE, data.step);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
goto found;
|
|
}
|
|
|
|
discard_backtrack(state);
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
found:
|
|
bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
|
|
|
|
++fuzzy_info->counts[data.fuzzy_type];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
*text_pos = data.new_text_pos;
|
|
*node = new_node;
|
|
*string_pos = data.new_string_pos;
|
|
*matched = TRUE;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Checks a fuzzy match of a atring. */
|
|
Py_LOCAL_INLINE(int) next_fuzzy_match_string_fld(RE_State* state, RE_FuzzyData*
|
|
data) {
|
|
int new_pos;
|
|
|
|
if (this_error_permitted(state, data->fuzzy_type)) {
|
|
switch (data->fuzzy_type) {
|
|
case RE_FUZZY_DEL:
|
|
/* Could a character at text_pos have been deleted? */
|
|
data->new_string_pos += data->step;
|
|
return RE_ERROR_SUCCESS;
|
|
case RE_FUZZY_INS:
|
|
/* Could the character at text_pos have been inserted? */
|
|
if (!data->permit_insertion)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
new_pos = data->new_folded_pos + data->step;
|
|
if (0 <= new_pos && new_pos <= data->folded_len) {
|
|
data->new_folded_pos = new_pos;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
return check_fuzzy_partial(state, new_pos);
|
|
case RE_FUZZY_SUB:
|
|
/* Could the character at text_pos have been substituted? */
|
|
new_pos = data->new_folded_pos + data->step;
|
|
if (0 <= new_pos && new_pos <= data->folded_len) {
|
|
data->new_folded_pos = new_pos;
|
|
data->new_string_pos += data->step;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
return check_fuzzy_partial(state, new_pos);
|
|
}
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
/* Tries a fuzzy match of a string, ignoring case. */
|
|
Py_LOCAL_INLINE(int) fuzzy_match_string_fld(RE_SafeState* safe_state, BOOL
|
|
search, Py_ssize_t* text_pos, RE_Node* node, Py_ssize_t* string_pos, int*
|
|
folded_pos, int folded_len, BOOL* matched, int step) {
|
|
RE_State* state;
|
|
RE_FuzzyData data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
Py_ssize_t new_text_pos;
|
|
RE_BacktrackData* bt_data;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
if (!any_error_permitted(state)) {
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
new_text_pos = *text_pos;
|
|
data.new_string_pos = *string_pos;
|
|
data.new_folded_pos = *folded_pos;
|
|
data.folded_len = folded_len;
|
|
data.step = step;
|
|
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
/* Permit insertion except initially when searching (it's better just to
|
|
* start searching one character later).
|
|
*/
|
|
data.permit_insertion = !search || new_text_pos != state->search_anchor;
|
|
if (step > 0) {
|
|
if (data.new_folded_pos != 0)
|
|
data.permit_insertion = RE_ERROR_SUCCESS;
|
|
} else {
|
|
if (data.new_folded_pos != folded_len)
|
|
data.permit_insertion = RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
for (data.fuzzy_type = 0; data.fuzzy_type < RE_FUZZY_COUNT;
|
|
data.fuzzy_type++) {
|
|
int status;
|
|
|
|
status = next_fuzzy_match_string_fld(state, &data);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
goto found;
|
|
}
|
|
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
found:
|
|
if (!add_backtrack(safe_state, node->op))
|
|
return RE_ERROR_FAILURE;
|
|
bt_data = state->backtrack;
|
|
bt_data->fuzzy_string.position.text_pos = *text_pos;
|
|
bt_data->fuzzy_string.position.node = node;
|
|
bt_data->fuzzy_string.string_pos = *string_pos;
|
|
bt_data->fuzzy_string.folded_pos = (RE_INT8)(*folded_pos);
|
|
bt_data->fuzzy_string.folded_len = (RE_INT8)folded_len;
|
|
bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
|
|
bt_data->fuzzy_string.step = (RE_INT8)step;
|
|
|
|
++fuzzy_info->counts[data.fuzzy_type];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
*text_pos = new_text_pos;
|
|
*string_pos = data.new_string_pos;
|
|
*folded_pos = data.new_folded_pos;
|
|
*matched = TRUE;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Retries a fuzzy match of a string, ignoring case. */
|
|
Py_LOCAL_INLINE(int) retry_fuzzy_match_string_fld(RE_SafeState* safe_state,
|
|
BOOL search, Py_ssize_t* text_pos, RE_Node** node, Py_ssize_t* string_pos,
|
|
int* folded_pos, BOOL* matched) {
|
|
RE_State* state;
|
|
RE_FuzzyData data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
RE_BacktrackData* bt_data;
|
|
Py_ssize_t new_text_pos;
|
|
RE_Node* new_node;
|
|
|
|
state = safe_state->re_state;
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
bt_data = state->backtrack;
|
|
new_text_pos = bt_data->fuzzy_string.position.text_pos;
|
|
new_node = bt_data->fuzzy_string.position.node;
|
|
data.new_string_pos = bt_data->fuzzy_string.string_pos;
|
|
data.new_folded_pos = bt_data->fuzzy_string.folded_pos;
|
|
data.folded_len = bt_data->fuzzy_string.folded_len;
|
|
data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type;
|
|
data.step = bt_data->fuzzy_string.step;
|
|
|
|
--fuzzy_info->counts[data.fuzzy_type];
|
|
--fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
--state->total_errors;
|
|
state->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
/* Permit insertion except initially when searching (it's better just to
|
|
* start searching one character later).
|
|
*/
|
|
data.permit_insertion = !search || new_text_pos != state->search_anchor;
|
|
if (data.step > 0) {
|
|
if (data.new_folded_pos != 0)
|
|
data.permit_insertion = RE_ERROR_SUCCESS;
|
|
} else {
|
|
if (data.new_folded_pos != bt_data->fuzzy_string.folded_len)
|
|
data.permit_insertion = RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
for (++data.fuzzy_type; data.fuzzy_type < RE_FUZZY_COUNT;
|
|
data.fuzzy_type++) {
|
|
int status;
|
|
|
|
status = next_fuzzy_match_string_fld(state, &data);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
goto found;
|
|
}
|
|
|
|
discard_backtrack(state);
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
found:
|
|
bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
|
|
|
|
++fuzzy_info->counts[data.fuzzy_type];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
*text_pos = new_text_pos;
|
|
*node = new_node;
|
|
*string_pos = data.new_string_pos;
|
|
*folded_pos = data.new_folded_pos;
|
|
*matched = TRUE;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Checks a fuzzy match of a atring. */
|
|
Py_LOCAL_INLINE(int) next_fuzzy_match_group_fld(RE_State* state, RE_FuzzyData*
|
|
data) {
|
|
int new_pos;
|
|
|
|
if (this_error_permitted(state, data->fuzzy_type)) {
|
|
switch (data->fuzzy_type) {
|
|
case RE_FUZZY_DEL:
|
|
/* Could a character at text_pos have been deleted? */
|
|
data->new_gfolded_pos += data->step;
|
|
return RE_ERROR_SUCCESS;
|
|
case RE_FUZZY_INS:
|
|
/* Could the character at text_pos have been inserted? */
|
|
if (!data->permit_insertion)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
new_pos = data->new_folded_pos + data->step;
|
|
if (0 <= new_pos && new_pos <= data->folded_len) {
|
|
data->new_folded_pos = new_pos;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
return check_fuzzy_partial(state, new_pos);
|
|
case RE_FUZZY_SUB:
|
|
/* Could the character at text_pos have been substituted? */
|
|
new_pos = data->new_folded_pos + data->step;
|
|
if (0 <= new_pos && new_pos <= data->folded_len) {
|
|
data->new_folded_pos = new_pos;
|
|
data->new_gfolded_pos += data->step;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
return check_fuzzy_partial(state, new_pos);
|
|
}
|
|
}
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
/* Tries a fuzzy match of a group reference, ignoring case. */
|
|
Py_LOCAL_INLINE(int) fuzzy_match_group_fld(RE_SafeState* safe_state, BOOL
|
|
search, Py_ssize_t* text_pos, RE_Node* node, int* folded_pos, int folded_len,
|
|
Py_ssize_t* group_pos, int* gfolded_pos, int gfolded_len, BOOL* matched, int
|
|
step) {
|
|
RE_State* state;
|
|
RE_FuzzyData data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
Py_ssize_t new_text_pos;
|
|
Py_ssize_t new_group_pos;
|
|
RE_BacktrackData* bt_data;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
if (!any_error_permitted(state)) {
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
new_text_pos = *text_pos;
|
|
data.new_folded_pos = *folded_pos;
|
|
data.folded_len = folded_len;
|
|
new_group_pos = *group_pos;
|
|
data.new_gfolded_pos = *gfolded_pos;
|
|
data.step = step;
|
|
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
/* Permit insertion except initially when searching (it's better just to
|
|
* start searching one character later).
|
|
*/
|
|
data.permit_insertion = !search || new_text_pos != state->search_anchor;
|
|
if (data.step > 0) {
|
|
if (data.new_folded_pos != 0)
|
|
data.permit_insertion = RE_ERROR_SUCCESS;
|
|
} else {
|
|
if (data.new_folded_pos != folded_len)
|
|
data.permit_insertion = RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
for (data.fuzzy_type = 0; data.fuzzy_type < RE_FUZZY_COUNT;
|
|
data.fuzzy_type++) {
|
|
int status;
|
|
|
|
status = next_fuzzy_match_group_fld(state, &data);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
goto found;
|
|
}
|
|
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
found:
|
|
if (!add_backtrack(safe_state, node->op))
|
|
return RE_ERROR_FAILURE;
|
|
bt_data = state->backtrack;
|
|
bt_data->fuzzy_string.position.text_pos = *text_pos;
|
|
bt_data->fuzzy_string.position.node = node;
|
|
bt_data->fuzzy_string.string_pos = *group_pos;
|
|
bt_data->fuzzy_string.folded_pos = (RE_INT8)(*folded_pos);
|
|
bt_data->fuzzy_string.folded_len = (RE_INT8)folded_len;
|
|
bt_data->fuzzy_string.gfolded_pos = (RE_INT8)(*gfolded_pos);
|
|
bt_data->fuzzy_string.gfolded_len = (RE_INT8)gfolded_len;
|
|
bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
|
|
bt_data->fuzzy_string.step = (RE_INT8)step;
|
|
|
|
++fuzzy_info->counts[data.fuzzy_type];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
*text_pos = new_text_pos;
|
|
*group_pos = new_group_pos;
|
|
*folded_pos = data.new_folded_pos;
|
|
*gfolded_pos = data.new_gfolded_pos;
|
|
*matched = TRUE;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Retries a fuzzy match of a group reference, ignoring case. */
|
|
Py_LOCAL_INLINE(int) retry_fuzzy_match_group_fld(RE_SafeState* safe_state, BOOL
|
|
search, Py_ssize_t* text_pos, RE_Node** node, int* folded_pos, Py_ssize_t*
|
|
group_pos, int* gfolded_pos, BOOL* matched) {
|
|
RE_State* state;
|
|
RE_FuzzyData data;
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_CODE* values;
|
|
RE_BacktrackData* bt_data;
|
|
Py_ssize_t new_text_pos;
|
|
Py_ssize_t new_group_pos;
|
|
RE_Node* new_node;
|
|
|
|
state = safe_state->re_state;
|
|
fuzzy_info = &state->fuzzy_info;
|
|
values = fuzzy_info->node->values;
|
|
|
|
bt_data = state->backtrack;
|
|
new_text_pos = bt_data->fuzzy_string.position.text_pos;
|
|
new_node = bt_data->fuzzy_string.position.node;
|
|
new_group_pos = bt_data->fuzzy_string.string_pos;
|
|
data.new_folded_pos = bt_data->fuzzy_string.folded_pos;
|
|
data.folded_len = bt_data->fuzzy_string.folded_len;
|
|
data.new_gfolded_pos = bt_data->fuzzy_string.gfolded_pos;
|
|
data.fuzzy_type = bt_data->fuzzy_string.fuzzy_type;
|
|
data.step = bt_data->fuzzy_string.step;
|
|
|
|
--fuzzy_info->counts[data.fuzzy_type];
|
|
--fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
--state->total_errors;
|
|
state->total_cost -= values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
/* Permit insertion except initially when searching (it's better just to
|
|
* start searching one character later).
|
|
*/
|
|
data.permit_insertion = !search || new_text_pos != state->search_anchor ||
|
|
data.new_folded_pos != bt_data->fuzzy_string.folded_len;
|
|
|
|
for (++data.fuzzy_type; data.fuzzy_type < RE_FUZZY_COUNT;
|
|
data.fuzzy_type++) {
|
|
int status;
|
|
|
|
status = next_fuzzy_match_group_fld(state, &data);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
goto found;
|
|
}
|
|
|
|
discard_backtrack(state);
|
|
*matched = FALSE;
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
found:
|
|
bt_data->fuzzy_string.fuzzy_type = (RE_INT8)data.fuzzy_type;
|
|
|
|
++fuzzy_info->counts[data.fuzzy_type];
|
|
++fuzzy_info->counts[RE_FUZZY_ERR];
|
|
fuzzy_info->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
++state->total_errors;
|
|
state->total_cost += values[RE_FUZZY_VAL_COST_BASE + data.fuzzy_type];
|
|
|
|
*text_pos = new_text_pos;
|
|
*node = new_node;
|
|
*group_pos = new_group_pos;
|
|
*folded_pos = data.new_folded_pos;
|
|
*gfolded_pos = data.new_gfolded_pos;
|
|
*matched = TRUE;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Locates the required string, if there's one. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) locate_required_string(RE_SafeState* safe_state) {
|
|
RE_State* state;
|
|
PatternObject* pattern;
|
|
Py_ssize_t found_pos;
|
|
Py_ssize_t end_pos;
|
|
|
|
state = safe_state->re_state;
|
|
pattern = state->pattern;
|
|
|
|
/* We haven't matched the required string yet. */
|
|
state->req_pos = -1;
|
|
|
|
if (!pattern->req_string)
|
|
/* There isn't a required string, so start matching from the current
|
|
* position.
|
|
*/
|
|
return state->text_pos;
|
|
|
|
/* Search for the required string and calculate where to start matching. */
|
|
switch (pattern->req_string->op) {
|
|
case RE_OP_STRING:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
found_pos = string_search(safe_state, pattern->req_string,
|
|
state->text_pos, state->slice_end, &is_partial);
|
|
if (found_pos < 0)
|
|
/* The required string wasn't found. */
|
|
return -1;
|
|
|
|
if (is_partial)
|
|
/* We found a partial match, so start matching from there. */
|
|
return found_pos;
|
|
|
|
/* Record where the required string matched. */
|
|
state->req_pos = found_pos;
|
|
state->req_end = found_pos +
|
|
(Py_ssize_t)pattern->req_string->value_count;
|
|
|
|
if (pattern->req_offset >= 0) {
|
|
/* Step back from the required string to where we should start
|
|
* matching.
|
|
*/
|
|
found_pos -= pattern->req_offset;
|
|
if (found_pos >= state->text_pos)
|
|
return found_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
found_pos = string_search_fld(safe_state, pattern->req_string,
|
|
state->text_pos, state->slice_end, &end_pos, &is_partial);
|
|
if (found_pos < 0)
|
|
/* The required string wasn't found. */
|
|
return -1;
|
|
|
|
if (is_partial)
|
|
/* We found a partial match, so start matching from there. */
|
|
return found_pos;
|
|
|
|
/* Record where the required string matched. */
|
|
state->req_pos = found_pos;
|
|
state->req_end = end_pos;
|
|
|
|
if (pattern->req_offset >= 0) {
|
|
/* Step back from the required string to where we should start
|
|
* matching.
|
|
*/
|
|
found_pos -= pattern->req_offset;
|
|
if (found_pos >= state->text_pos)
|
|
return found_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD_REV:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
found_pos = string_search_fld_rev(safe_state, pattern->req_string,
|
|
state->text_pos, state->slice_start, &end_pos, &is_partial);
|
|
if (found_pos < 0)
|
|
/* The required string wasn't found. */
|
|
return -1;
|
|
|
|
if (is_partial)
|
|
/* We found a partial match, so start matching from there. */
|
|
return found_pos;
|
|
|
|
/* Record where the required string matched. */
|
|
state->req_pos = found_pos;
|
|
state->req_end = end_pos;
|
|
|
|
if (pattern->req_offset >= 0) {
|
|
/* Step back from the required string to where we should start
|
|
* matching.
|
|
*/
|
|
found_pos += pattern->req_offset;
|
|
if (found_pos <= state->text_pos)
|
|
return found_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
found_pos = string_search_ign(safe_state, pattern->req_string,
|
|
state->text_pos, state->slice_end, &is_partial);
|
|
if (found_pos < 0)
|
|
/* The required string wasn't found. */
|
|
return -1;
|
|
|
|
if (is_partial)
|
|
/* We found a partial match, so start matching from there. */
|
|
return found_pos;
|
|
|
|
/* Record where the required string matched. */
|
|
state->req_pos = found_pos;
|
|
state->req_end = found_pos +
|
|
(Py_ssize_t)pattern->req_string->value_count;
|
|
|
|
if (pattern->req_offset >= 0) {
|
|
/* Step back from the required string to where we should start
|
|
* matching.
|
|
*/
|
|
found_pos -= pattern->req_offset;
|
|
if (found_pos >= state->text_pos)
|
|
return found_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN_REV:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
found_pos = string_search_ign_rev(safe_state, pattern->req_string,
|
|
state->text_pos, state->slice_start, &is_partial);
|
|
if (found_pos < 0)
|
|
/* The required string wasn't found. */
|
|
return -1;
|
|
|
|
if (is_partial)
|
|
/* We found a partial match, so start matching from there. */
|
|
return found_pos;
|
|
|
|
/* Record where the required string matched. */
|
|
state->req_pos = found_pos;
|
|
state->req_end = found_pos -
|
|
(Py_ssize_t)pattern->req_string->value_count;
|
|
|
|
if (pattern->req_offset >= 0) {
|
|
/* Step back from the required string to where we should start
|
|
* matching.
|
|
*/
|
|
found_pos += pattern->req_offset;
|
|
if (found_pos <= state->text_pos)
|
|
return found_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_REV:
|
|
{
|
|
BOOL is_partial;
|
|
|
|
found_pos = string_search_rev(safe_state, pattern->req_string,
|
|
state->text_pos, state->slice_start, &is_partial);
|
|
if (found_pos < 0)
|
|
/* The required string wasn't found. */
|
|
return -1;
|
|
|
|
if (is_partial)
|
|
/* We found a partial match, so start matching from there. */
|
|
return found_pos;
|
|
|
|
/* Record where the required string matched. */
|
|
state->req_pos = found_pos;
|
|
state->req_end = found_pos -
|
|
(Py_ssize_t)pattern->req_string->value_count;
|
|
|
|
if (pattern->req_offset >= 0) {
|
|
/* Step back from the required string to where we should start
|
|
* matching.
|
|
*/
|
|
found_pos += pattern->req_offset;
|
|
if (found_pos <= state->text_pos)
|
|
return found_pos;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Start matching from the current position. */
|
|
return state->text_pos;
|
|
}
|
|
|
|
/* Tries to match a character pattern. */
|
|
Py_LOCAL_INLINE(int) match_one(RE_State* state, RE_Node* node, Py_ssize_t
|
|
text_pos) {
|
|
switch (node->op) {
|
|
case RE_OP_ANY:
|
|
return try_match_ANY(state, node, text_pos);
|
|
case RE_OP_ANY_ALL:
|
|
return try_match_ANY_ALL(state, node, text_pos);
|
|
case RE_OP_ANY_ALL_REV:
|
|
return try_match_ANY_ALL_REV(state, node, text_pos);
|
|
case RE_OP_ANY_REV:
|
|
return try_match_ANY_REV(state, node, text_pos);
|
|
case RE_OP_ANY_U:
|
|
return try_match_ANY_U(state, node, text_pos);
|
|
case RE_OP_ANY_U_REV:
|
|
return try_match_ANY_U_REV(state, node, text_pos);
|
|
case RE_OP_CHARACTER:
|
|
return try_match_CHARACTER(state, node, text_pos);
|
|
case RE_OP_CHARACTER_IGN:
|
|
return try_match_CHARACTER_IGN(state, node, text_pos);
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
return try_match_CHARACTER_IGN_REV(state, node, text_pos);
|
|
case RE_OP_CHARACTER_REV:
|
|
return try_match_CHARACTER_REV(state, node, text_pos);
|
|
case RE_OP_PROPERTY:
|
|
return try_match_PROPERTY(state, node, text_pos);
|
|
case RE_OP_PROPERTY_IGN:
|
|
return try_match_PROPERTY_IGN(state, node, text_pos);
|
|
case RE_OP_PROPERTY_IGN_REV:
|
|
return try_match_PROPERTY_IGN_REV(state, node, text_pos);
|
|
case RE_OP_PROPERTY_REV:
|
|
return try_match_PROPERTY_REV(state, node, text_pos);
|
|
case RE_OP_RANGE:
|
|
return try_match_RANGE(state, node, text_pos);
|
|
case RE_OP_RANGE_IGN:
|
|
return try_match_RANGE_IGN(state, node, text_pos);
|
|
case RE_OP_RANGE_IGN_REV:
|
|
return try_match_RANGE_IGN_REV(state, node, text_pos);
|
|
case RE_OP_RANGE_REV:
|
|
return try_match_RANGE_REV(state, node, text_pos);
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_UNION:
|
|
return try_match_SET(state, node, text_pos);
|
|
case RE_OP_SET_DIFF_IGN:
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_UNION_IGN:
|
|
return try_match_SET_IGN(state, node, text_pos);
|
|
case RE_OP_SET_DIFF_IGN_REV:
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
return try_match_SET_IGN_REV(state, node, text_pos);
|
|
case RE_OP_SET_DIFF_REV:
|
|
case RE_OP_SET_INTER_REV:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
case RE_OP_SET_UNION_REV:
|
|
return try_match_SET_REV(state, node, text_pos);
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Performs a depth-first match or search from the context. */
|
|
Py_LOCAL_INLINE(int) basic_match(RE_SafeState* safe_state, RE_Node* start_node,
|
|
BOOL search, BOOL recursive_call) {
|
|
RE_State* state;
|
|
RE_EncodingTable* encoding;
|
|
PatternObject* pattern;
|
|
RE_NextNode start_pair;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
Py_ssize_t pattern_step; /* The overall step of the pattern (forwards or backwards). */
|
|
Py_ssize_t string_pos;
|
|
BOOL do_search_start;
|
|
Py_ssize_t found_pos;
|
|
int folded_pos;
|
|
int gfolded_pos;
|
|
RE_Node* node;
|
|
int status;
|
|
TRACE(("<<basic_match>>\n"))
|
|
|
|
state = safe_state->re_state;
|
|
encoding = state->encoding;
|
|
pattern = state->pattern;
|
|
|
|
/* Look beyond any initial group node. */
|
|
start_pair.node = start_node;
|
|
if (recursive_call)
|
|
start_pair.test = locate_test_start(start_node);
|
|
else
|
|
start_pair.test = pattern->start_test;
|
|
|
|
/* Is the pattern anchored to the start or end of the string? */
|
|
switch (start_pair.test->op) {
|
|
case RE_OP_END_OF_STRING:
|
|
if (state->reverse) {
|
|
/* Searching backwards. */
|
|
if (state->text_pos != state->text_length)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
/* Don't bother to search further because it's anchored. */
|
|
search = FALSE;
|
|
}
|
|
break;
|
|
case RE_OP_START_OF_STRING:
|
|
if (!state->reverse) {
|
|
/* Searching forwards. */
|
|
if (state->text_pos != 0)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
/* Don't bother to search further because it's anchored. */
|
|
search = FALSE;
|
|
}
|
|
break;
|
|
}
|
|
|
|
char_at = state->char_at;
|
|
pattern_step = state->reverse ? -1 : 1;
|
|
string_pos = -1;
|
|
do_search_start = pattern->do_search_start;
|
|
|
|
/* Add a backtrack entry for failure. */
|
|
if (!add_backtrack(safe_state, RE_OP_FAILURE))
|
|
return RE_ERROR_BACKTRACKING;
|
|
|
|
start_match:
|
|
/* If we're searching, advance along the string until there could be a
|
|
* match.
|
|
*/
|
|
if (pattern->pattern_call_ref >= 0) {
|
|
RE_GuardList* guard_list;
|
|
|
|
guard_list = &state->group_call_guard_list[pattern->pattern_call_ref];
|
|
guard_list->count = 0;
|
|
guard_list->last_text_pos = -1;
|
|
}
|
|
|
|
/* Locate the required string, if there's one, unless this is a recursive
|
|
* call of 'basic_match'.
|
|
*/
|
|
if (!pattern->req_string || recursive_call)
|
|
found_pos = state->text_pos;
|
|
else {
|
|
found_pos = locate_required_string(safe_state);
|
|
if (found_pos < 0)
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
|
|
if (search) {
|
|
state->text_pos = found_pos;
|
|
|
|
if (do_search_start) {
|
|
RE_Position new_position;
|
|
|
|
next_match_1:
|
|
/* 'search_start' will clear 'do_search_start' if it can't perform
|
|
* a fast search for the next possible match. This enables us to
|
|
* avoid the overhead of the call subsequently.
|
|
*/
|
|
status = search_start(safe_state, &start_pair, &new_position, 0);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
node = new_position.node;
|
|
state->text_pos = new_position.text_pos;
|
|
|
|
if (node->op == RE_OP_SUCCESS) {
|
|
/* Must the match advance past its start? */
|
|
if (state->text_pos != state->search_anchor ||
|
|
!state->must_advance)
|
|
return RE_ERROR_SUCCESS;
|
|
|
|
state->text_pos = state->match_pos + pattern_step;
|
|
goto next_match_1;
|
|
}
|
|
|
|
/* 'do_search_start' may have been cleared. */
|
|
do_search_start = pattern->do_search_start;
|
|
} else {
|
|
/* Avoiding 'search_start', which we've found can't perform a fast
|
|
* search for the next possible match.
|
|
*/
|
|
node = start_node;
|
|
|
|
next_match_2:
|
|
if (state->reverse) {
|
|
if (state->text_pos < state->slice_start) {
|
|
if (state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
} else {
|
|
if (state->text_pos > state->slice_end) {
|
|
if (state-> partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
return RE_ERROR_FAILURE;
|
|
}
|
|
}
|
|
|
|
state->match_pos = state->text_pos;
|
|
|
|
if (node->op == RE_OP_SUCCESS) {
|
|
/* Must the match advance past its start? */
|
|
if (state->text_pos != state->search_anchor ||
|
|
!state->must_advance) {
|
|
BOOL success;
|
|
|
|
if (state->match_all && !recursive_call) {
|
|
/* We want to match all of the slice. */
|
|
if (state->reverse)
|
|
success = state->text_pos == state->slice_start;
|
|
else
|
|
success = state->text_pos == state->slice_end;
|
|
} else
|
|
success = TRUE;
|
|
|
|
if (success)
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
state->text_pos = state->match_pos + pattern_step;
|
|
goto next_match_2;
|
|
}
|
|
}
|
|
} else {
|
|
/* The start position is anchored to the current position. */
|
|
if (found_pos != state->text_pos)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
node = start_node;
|
|
}
|
|
|
|
advance:
|
|
/* The main matching loop. */
|
|
for (;;) {
|
|
TRACE(("%d|", state->text_pos))
|
|
|
|
/* Should we abort the matching? */
|
|
++state->iterations;
|
|
|
|
if (state->iterations == 0 && safe_check_signals(safe_state))
|
|
return RE_ERROR_INTERRUPTED;
|
|
|
|
switch (node->op) {
|
|
case RE_OP_ANY: /* Any character except a newline. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_ANY(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
++state->text_pos;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_ANY_ALL: /* Any character at all. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_ANY_ALL(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
++state->text_pos;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_ANY_ALL_REV: /* Any character at all, backwards. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_ANY_ALL_REV(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
--state->text_pos;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_ANY_REV: /* Any character except a newline, backwards. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_ANY_REV(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
--state->text_pos;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_ANY_U: /* Any character except a line separator. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_ANY_U(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
++state->text_pos;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_ANY_U_REV: /* Any character except a line separator, backwards. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_ANY_U_REV(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
--state->text_pos;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_ATOMIC: /* Atomic subpattern. */
|
|
{
|
|
RE_Info info;
|
|
int status;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
if (!add_backtrack(safe_state, RE_OP_ATOMIC))
|
|
return RE_ERROR_BACKTRACKING;
|
|
state->backtrack->atomic.too_few_errors = state->too_few_errors;
|
|
state->backtrack->atomic.capture_change = state->capture_change;
|
|
|
|
/* Save the groups. */
|
|
if (!push_groups(safe_state))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
save_info(state, &info);
|
|
|
|
state->must_advance = FALSE;
|
|
|
|
status = basic_match(safe_state, node->nonstring.next_2.node,
|
|
FALSE, TRUE);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
reset_guards(state, node->values);
|
|
|
|
restore_info(state, &info);
|
|
|
|
if (status != RE_ERROR_SUCCESS)
|
|
goto backtrack;
|
|
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_BOUNDARY: /* On a word boundary. */
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->match))
|
|
|
|
status = try_match_BOUNDARY(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_BRANCH: /* 2-way branch. */
|
|
{
|
|
RE_Position next_position;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match(state, &node->next_1, state->text_pos,
|
|
&next_position);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
if (!add_backtrack(safe_state, RE_OP_BRANCH))
|
|
return RE_ERROR_BACKTRACKING;
|
|
state->backtrack->branch.position.node =
|
|
node->nonstring.next_2.node;
|
|
state->backtrack->branch.position.text_pos = state->text_pos;
|
|
|
|
node = next_position.node;
|
|
state->text_pos = next_position.text_pos;
|
|
} else
|
|
node = node->nonstring.next_2.node;
|
|
break;
|
|
}
|
|
case RE_OP_CALL_REF: /* A group call reference. */
|
|
{
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
if (!push_group_return(safe_state, NULL))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
if (!add_backtrack(safe_state, RE_OP_CALL_REF))
|
|
return RE_ERROR_BACKTRACKING;
|
|
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_CHARACTER: /* A character. */
|
|
TRACE(("%s %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0]))
|
|
|
|
if (state->text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end &&
|
|
matches_CHARACTER(encoding, node, char_at(state->text,
|
|
state->text_pos)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_CHARACTER_IGN: /* A character, ignoring case. */
|
|
TRACE(("%s %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0]))
|
|
|
|
if (state->text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end &&
|
|
matches_CHARACTER_IGN(encoding, node, char_at(state->text,
|
|
state->text_pos)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_CHARACTER_IGN_REV: /* A character, backwards, ignoring case. */
|
|
TRACE(("%s %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0]))
|
|
|
|
if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
matches_CHARACTER_IGN(encoding, node, char_at(state->text,
|
|
state->text_pos - 1)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_CHARACTER_REV: /* A character, backwards. */
|
|
TRACE(("%s %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0]))
|
|
|
|
if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
matches_CHARACTER(encoding, node, char_at(state->text,
|
|
state->text_pos - 1)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_DEFAULT_BOUNDARY: /* On a default word boundary. */
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->match))
|
|
|
|
status = try_match_DEFAULT_BOUNDARY(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_DEFAULT_END_OF_WORD: /* At the default end of a word. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_DEFAULT_END_OF_WORD(state, node,
|
|
state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_DEFAULT_START_OF_WORD: /* At the default start of a word. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_DEFAULT_START_OF_WORD(state, node,
|
|
state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_END_FUZZY: /* End of fuzzy matching. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
if (!fuzzy_insert(safe_state, state->text_pos, node))
|
|
return RE_ERROR_BACKTRACKING;
|
|
|
|
/* If there were too few errors, in the fuzzy section, try again.
|
|
*/
|
|
if (state->too_few_errors) {
|
|
state->too_few_errors = FALSE;
|
|
goto backtrack;
|
|
}
|
|
|
|
state->total_fuzzy_counts[RE_FUZZY_SUB] +=
|
|
state->fuzzy_info.counts[RE_FUZZY_SUB];
|
|
state->total_fuzzy_counts[RE_FUZZY_INS] +=
|
|
state->fuzzy_info.counts[RE_FUZZY_INS];
|
|
state->total_fuzzy_counts[RE_FUZZY_DEL] +=
|
|
state->fuzzy_info.counts[RE_FUZZY_DEL];
|
|
|
|
node = node->next_1.node;
|
|
break;
|
|
case RE_OP_END_GREEDY_REPEAT: /* End of a greedy repeat. */
|
|
{
|
|
RE_CODE index;
|
|
RE_RepeatData* rp_data;
|
|
BOOL changed;
|
|
BOOL try_body;
|
|
int body_status;
|
|
RE_Position next_body_position;
|
|
BOOL try_tail;
|
|
int tail_status;
|
|
RE_Position next_tail_position;
|
|
RE_BacktrackData* bt_data;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Repeat indexes are 0-based. */
|
|
index = node->values[0];
|
|
rp_data = &state->repeats[index];
|
|
|
|
/* The body has matched successfully at this position. */
|
|
if (!guard_repeat(safe_state, index, rp_data->start,
|
|
RE_STATUS_BODY, FALSE))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
++rp_data->count;
|
|
|
|
/* Have we advanced through the text or has a capture group change?
|
|
*/
|
|
changed = rp_data->capture_change != state->capture_change ||
|
|
state->text_pos != rp_data->start;
|
|
|
|
/* The counts are of type size_t, so the format needs to specify
|
|
* that.
|
|
*/
|
|
TRACE(("min is %" PY_FORMAT_SIZE_T "u, max is %" PY_FORMAT_SIZE_T
|
|
"u, count is %" PY_FORMAT_SIZE_T "u\n", node->values[1],
|
|
node->values[2], rp_data->count))
|
|
|
|
/* Could the body or tail match? */
|
|
try_body = changed && (rp_data->count < node->values[2] ||
|
|
~node->values[2] == 0) && !is_repeat_guarded(safe_state, index,
|
|
state->text_pos, RE_STATUS_BODY);
|
|
if (try_body) {
|
|
body_status = try_match(state, &node->next_1, state->text_pos,
|
|
&next_body_position);
|
|
|
|
if (body_status == RE_ERROR_FAILURE)
|
|
try_body = FALSE;
|
|
} else
|
|
body_status = RE_ERROR_FAILURE;
|
|
|
|
try_tail = (!changed || rp_data->count >= node->values[1]) &&
|
|
!is_repeat_guarded(safe_state, index, state->text_pos,
|
|
RE_STATUS_TAIL);
|
|
if(try_tail) {
|
|
tail_status = try_match(state, &node->nonstring.next_2,
|
|
state->text_pos, &next_tail_position);
|
|
|
|
if (tail_status == RE_ERROR_FAILURE)
|
|
try_tail = FALSE;
|
|
} else
|
|
tail_status = RE_ERROR_FAILURE;
|
|
|
|
if (!try_body && !try_tail) {
|
|
/* Neither the body nor the tail could match. */
|
|
--rp_data->count;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (body_status < 0 || (body_status == 0 && tail_status < 0))
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
/* Record info in case we backtrack into the body. */
|
|
if (!add_backtrack(safe_state, RE_OP_BODY_END))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.count = rp_data->count - 1;
|
|
bt_data->repeat.start = rp_data->start;
|
|
bt_data->repeat.capture_change = rp_data->capture_change;
|
|
|
|
if (try_body) {
|
|
/* Both the body and the tail could match. */
|
|
if (try_tail) {
|
|
/* The body takes precedence. If the body fails to match
|
|
* then we want to try the tail before backtracking
|
|
* further.
|
|
*/
|
|
|
|
/* Record backtracking info for matching the tail. */
|
|
if (!add_backtrack(safe_state, RE_OP_MATCH_TAIL))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.position = next_tail_position;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.count = rp_data->count;
|
|
bt_data->repeat.start = rp_data->start;
|
|
bt_data->repeat.capture_change = rp_data->capture_change;
|
|
bt_data->repeat.text_pos = state->text_pos;
|
|
}
|
|
|
|
/* Record backtracking info in case the body fails to match. */
|
|
if (!add_backtrack(safe_state, RE_OP_BODY_START))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.text_pos = state->text_pos;
|
|
|
|
rp_data->capture_change = state->capture_change;
|
|
rp_data->start = state->text_pos;
|
|
|
|
/* Advance into the body. */
|
|
node = next_body_position.node;
|
|
state->text_pos = next_body_position.text_pos;
|
|
} else {
|
|
/* Only the tail could match. */
|
|
|
|
/* Advance into the tail. */
|
|
node = next_tail_position.node;
|
|
state->text_pos = next_tail_position.text_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_END_GROUP: /* End of a capture group. */
|
|
{
|
|
RE_CODE private_index;
|
|
RE_CODE public_index;
|
|
RE_GroupData* group;
|
|
RE_BacktrackData* bt_data;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[1]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*/
|
|
private_index = node->values[0];
|
|
public_index = node->values[1];
|
|
group = &state->groups[private_index - 1];
|
|
|
|
if (!add_backtrack(safe_state, RE_OP_END_GROUP))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->group.private_index = private_index;
|
|
bt_data->group.public_index = public_index;
|
|
bt_data->group.text_pos = group->span.end;
|
|
bt_data->group.capture = (BOOL)node->values[2];
|
|
bt_data->group.current_capture = group->current_capture;
|
|
|
|
if (pattern->group_info[private_index - 1].referenced &&
|
|
group->span.end != state->text_pos)
|
|
++state->capture_change;
|
|
group->span.end = state->text_pos;
|
|
|
|
/* Save the capture? */
|
|
if (node->values[2]) {
|
|
group->current_capture = (Py_ssize_t)group->capture_count;
|
|
if (!save_capture(safe_state, private_index, public_index))
|
|
return RE_ERROR_MEMORY;
|
|
}
|
|
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_END_LAZY_REPEAT: /* End of a lazy repeat. */
|
|
{
|
|
RE_CODE index;
|
|
RE_RepeatData* rp_data;
|
|
BOOL changed;
|
|
BOOL try_body;
|
|
int body_status;
|
|
RE_Position next_body_position;
|
|
BOOL try_tail;
|
|
int tail_status;
|
|
RE_Position next_tail_position;
|
|
RE_BacktrackData* bt_data;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Repeat indexes are 0-based. */
|
|
index = node->values[0];
|
|
rp_data = &state->repeats[index];
|
|
|
|
/* The body has matched successfully at this position. */
|
|
if (!guard_repeat(safe_state, index, rp_data->start,
|
|
RE_STATUS_BODY, FALSE))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
++rp_data->count;
|
|
|
|
/* Have we advanced through the text or has a capture group changed?
|
|
*/
|
|
changed = rp_data->capture_change != state->capture_change ||
|
|
state->text_pos != rp_data->start;
|
|
|
|
/* The counts are of type size_t, so the format needs to specify
|
|
* that.
|
|
*/
|
|
TRACE(("min is %" PY_FORMAT_SIZE_T "u, max is %" PY_FORMAT_SIZE_T
|
|
"u, count is %" PY_FORMAT_SIZE_T "u\n", node->values[1],
|
|
node->values[2], rp_data->count))
|
|
|
|
/* Could the body or tail match? */
|
|
try_body = changed && (rp_data->count < node->values[2] ||
|
|
~node->values[2] == 0) && !is_repeat_guarded(safe_state, index,
|
|
state->text_pos, RE_STATUS_BODY);
|
|
if (try_body) {
|
|
body_status = try_match(state, &node->next_1, state->text_pos,
|
|
&next_body_position);
|
|
|
|
if (body_status == RE_ERROR_FAILURE)
|
|
try_body = FALSE;
|
|
} else
|
|
body_status = RE_ERROR_FAILURE;
|
|
|
|
try_tail = (!changed || rp_data->count >= node->values[1]);
|
|
if (try_tail) {
|
|
tail_status = try_match(state, &node->nonstring.next_2,
|
|
state->text_pos, &next_tail_position);
|
|
|
|
if (tail_status == RE_ERROR_FAILURE)
|
|
try_tail = FALSE;
|
|
} else
|
|
tail_status = RE_ERROR_FAILURE;
|
|
|
|
if (!try_body && !try_tail) {
|
|
/* Neither the body nor the tail could match. */
|
|
--rp_data->count;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (body_status < 0 || (body_status == 0 && tail_status < 0))
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
/* Record info in case we backtrack into the body. */
|
|
if (!add_backtrack(safe_state, RE_OP_BODY_END))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.count = rp_data->count - 1;
|
|
bt_data->repeat.start = rp_data->start;
|
|
bt_data->repeat.capture_change = rp_data->capture_change;
|
|
|
|
if (try_body) {
|
|
/* Both the body and the tail could match. */
|
|
if (try_tail) {
|
|
/* The tail takes precedence. If the tail fails to match
|
|
* then we want to try the body before backtracking
|
|
* further.
|
|
*/
|
|
|
|
/* Record backtracking info for matching the body. */
|
|
if (!add_backtrack(safe_state, RE_OP_MATCH_BODY))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.position = next_body_position;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.count = rp_data->count;
|
|
bt_data->repeat.start = rp_data->start;
|
|
bt_data->repeat.capture_change = rp_data->capture_change;
|
|
bt_data->repeat.text_pos = state->text_pos;
|
|
|
|
/* Advance into the tail. */
|
|
node = next_tail_position.node;
|
|
state->text_pos = next_tail_position.text_pos;
|
|
} else {
|
|
/* Only the body could match. */
|
|
|
|
/* Record backtracking info in case the body fails to
|
|
* match.
|
|
*/
|
|
if (!add_backtrack(safe_state, RE_OP_BODY_START))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.text_pos = state->text_pos;
|
|
|
|
rp_data->capture_change = state->capture_change;
|
|
rp_data->start = state->text_pos;
|
|
|
|
/* Advance into the body. */
|
|
node = next_body_position.node;
|
|
state->text_pos = next_body_position.text_pos;
|
|
}
|
|
} else {
|
|
/* Only the tail could match. */
|
|
|
|
/* Advance into the tail. */
|
|
node = next_tail_position.node;
|
|
state->text_pos = next_tail_position.text_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_END_OF_LINE: /* At the end of a line. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_END_OF_LINE(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_END_OF_LINE_U: /* At the end of a line. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_END_OF_LINE_U(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_END_OF_STRING: /* At the end of the string. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_END_OF_STRING(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_END_OF_STRING_LINE: /* At end of string or final newline. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_END_OF_STRING_LINE(state, node,
|
|
state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_END_OF_STRING_LINE_U: /* At end of string or final newline. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_END_OF_STRING_LINE_U(state, node,
|
|
state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_END_OF_WORD: /* At the end of a word. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_END_OF_WORD(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_FUZZY: /* Fuzzy matching. */
|
|
{
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
RE_BacktrackData* bt_data;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
fuzzy_info = &state->fuzzy_info;
|
|
|
|
/* Save the current fuzzy info. */
|
|
if (!add_backtrack(safe_state, RE_OP_FUZZY))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
memmove(&bt_data->fuzzy.fuzzy_info, fuzzy_info,
|
|
sizeof(RE_FuzzyInfo));
|
|
bt_data->fuzzy.index = node->values[0];
|
|
bt_data->fuzzy.text_pos = state->text_pos;
|
|
|
|
/* Initialise the new fuzzy info. */
|
|
memset(fuzzy_info->counts, 0, 4 * sizeof(fuzzy_info->counts[0]));
|
|
fuzzy_info->total_cost = 0;
|
|
fuzzy_info->node = node;
|
|
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_GRAPHEME_BOUNDARY: /* On a grapheme boundary. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_GRAPHEME_BOUNDARY(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_GREEDY_REPEAT: /* Greedy repeat. */
|
|
{
|
|
RE_CODE index;
|
|
RE_RepeatData* rp_data;
|
|
RE_BacktrackData* bt_data;
|
|
BOOL try_body;
|
|
int body_status;
|
|
RE_Position next_body_position;
|
|
BOOL try_tail;
|
|
int tail_status;
|
|
RE_Position next_tail_position;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Repeat indexes are 0-based. */
|
|
index = node->values[0];
|
|
rp_data = &state->repeats[index];
|
|
|
|
/* We might need to backtrack into the head, so save the current
|
|
* repeat.
|
|
*/
|
|
if (!add_backtrack(safe_state, RE_OP_GREEDY_REPEAT))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.count = rp_data->count;
|
|
bt_data->repeat.start = rp_data->start;
|
|
bt_data->repeat.capture_change = rp_data->capture_change;
|
|
bt_data->repeat.text_pos = state->text_pos;
|
|
|
|
/* Initialise the new repeat. */
|
|
rp_data->count = 0;
|
|
rp_data->start = state->text_pos;
|
|
rp_data->capture_change = state->capture_change;
|
|
|
|
/* Could the body or tail match? */
|
|
try_body = node->values[2] > 0 && !is_repeat_guarded(safe_state,
|
|
index, state->text_pos, RE_STATUS_BODY);
|
|
if (try_body) {
|
|
body_status = try_match(state, &node->next_1, state->text_pos,
|
|
&next_body_position);
|
|
|
|
if (body_status == RE_ERROR_FAILURE)
|
|
try_body = FALSE;
|
|
} else
|
|
body_status = RE_ERROR_FAILURE;
|
|
|
|
try_tail = node->values[1] == 0;
|
|
if (try_tail) {
|
|
tail_status = try_match(state, &node->nonstring.next_2,
|
|
state->text_pos, &next_tail_position);
|
|
|
|
if (tail_status == RE_ERROR_FAILURE)
|
|
try_tail = FALSE;
|
|
} else
|
|
tail_status = RE_ERROR_FAILURE;
|
|
if (!try_body && !try_tail)
|
|
/* Neither the body nor the tail could match. */
|
|
goto backtrack;
|
|
|
|
if (body_status < 0 || (body_status == 0 && tail_status < 0))
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (try_body) {
|
|
if (try_tail) {
|
|
/* Both the body and the tail could match, but the body
|
|
* takes precedence. If the body fails to match then we
|
|
* want to try the tail before backtracking further.
|
|
*/
|
|
|
|
/* Record backtracking info for matching the tail. */
|
|
if (!add_backtrack(safe_state, RE_OP_MATCH_TAIL))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.position = next_tail_position;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.count = rp_data->count;
|
|
bt_data->repeat.start = rp_data->start;
|
|
bt_data->repeat.capture_change = rp_data->capture_change;
|
|
bt_data->repeat.text_pos = state->text_pos;
|
|
}
|
|
|
|
/* Advance into the body. */
|
|
node = next_body_position.node;
|
|
state->text_pos = next_body_position.text_pos;
|
|
} else {
|
|
/* Only the tail could match. */
|
|
|
|
/* Advance into the tail. */
|
|
node = next_tail_position.node;
|
|
state->text_pos = next_tail_position.text_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_GREEDY_REPEAT_ONE: /* Greedy repeat for one character. */
|
|
{
|
|
RE_CODE index;
|
|
RE_RepeatData* rp_data;
|
|
size_t count;
|
|
BOOL is_partial;
|
|
BOOL match;
|
|
RE_BacktrackData* bt_data;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Repeat indexes are 0-based. */
|
|
index = node->values[0];
|
|
rp_data = &state->repeats[index];
|
|
|
|
if (is_repeat_guarded(safe_state, index, state->text_pos,
|
|
RE_STATUS_BODY))
|
|
goto backtrack;
|
|
|
|
/* Count how many times the character repeats, up to the maximum.
|
|
*/
|
|
count = count_one(state, node->nonstring.next_2.node,
|
|
state->text_pos, node->values[2], &is_partial);
|
|
if (is_partial) {
|
|
state->text_pos += (Py_ssize_t)count * node->step;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
/* Unmatch until it's not guarded. */
|
|
match = FALSE;
|
|
for (;;) {
|
|
if (count < node->values[1])
|
|
/* The number of repeats is below the minimum. */
|
|
break;
|
|
|
|
if (!is_repeat_guarded(safe_state, index, state->text_pos +
|
|
(Py_ssize_t)count * node->step, RE_STATUS_TAIL)) {
|
|
/* It's not guarded at this position. */
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (count == 0)
|
|
break;
|
|
|
|
--count;
|
|
}
|
|
|
|
if (!match) {
|
|
/* The repeat has failed to match at this position. */
|
|
if (!guard_repeat(safe_state, index, state->text_pos,
|
|
RE_STATUS_BODY, TRUE))
|
|
return RE_ERROR_MEMORY;
|
|
goto backtrack;
|
|
}
|
|
|
|
/* Record the backtracking info. */
|
|
if (!add_backtrack(safe_state, RE_OP_GREEDY_REPEAT_ONE))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.position.node = node;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.text_pos = rp_data->start;
|
|
bt_data->repeat.count = rp_data->count;
|
|
|
|
rp_data->start = state->text_pos;
|
|
rp_data->count = count;
|
|
|
|
/* Advance into the tail. */
|
|
state->text_pos += (Py_ssize_t)count * node->step;
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_GROUP_CALL: /* Group call. */
|
|
{
|
|
size_t index;
|
|
size_t g;
|
|
size_t r;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
index = node->values[0];
|
|
|
|
/* Save the capture groups and repeat guards. */
|
|
if (!push_group_return(safe_state, node->next_1.node))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Clear the capture groups for the group call. They'll be restored
|
|
* on return.
|
|
*/
|
|
for (g = 0; g < state->pattern->true_group_count; g++) {
|
|
RE_GroupData* group;
|
|
|
|
group = &state->groups[g];
|
|
group->span.start = -1;
|
|
group->span.end = -1;
|
|
group->current_capture = -1;
|
|
}
|
|
|
|
/* Clear the repeat guards for the group call. They'll be restored
|
|
* on return.
|
|
*/
|
|
for (r = 0; r < state->pattern->repeat_count; r++) {
|
|
RE_RepeatData* repeat;
|
|
|
|
repeat = &state->repeats[r];
|
|
repeat->body_guard_list.count = 0;
|
|
repeat->body_guard_list.last_text_pos = -1;
|
|
repeat->tail_guard_list.count = 0;
|
|
repeat->tail_guard_list.last_text_pos = -1;
|
|
}
|
|
|
|
/* Call a group, skipping its CALL_REF node. */
|
|
node = pattern->call_ref_info[index].node->next_1.node;
|
|
|
|
if (!add_backtrack(safe_state, RE_OP_GROUP_CALL))
|
|
return RE_ERROR_BACKTRACKING;
|
|
break;
|
|
}
|
|
case RE_OP_GROUP_EXISTS: /* Capture group exists. */
|
|
{
|
|
RE_GroupData* group;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*
|
|
* Check whether the group currently has a capture: if it does, follow
|
|
* next_1, otherwise follow next_2.
|
|
*/
|
|
group = &state->groups[node->values[0] - 1];
|
|
if (group->current_capture >= 0)
|
|
node = node->next_1.node;
|
|
else
|
|
node = node->nonstring.next_2.node;
|
|
break;
|
|
}
|
|
case RE_OP_GROUP_RETURN: /* Group return. */
|
|
{
|
|
RE_Node* return_node;
|
|
RE_BacktrackData* bt_data;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
return_node = top_group_return(state);
|
|
|
|
if (!add_backtrack(safe_state, RE_OP_GROUP_RETURN))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->group_call.node = return_node;
|
|
bt_data->group_call.capture_change = state->capture_change;
|
|
|
|
if (return_node) {
|
|
/* The group was called. */
|
|
node = return_node;
|
|
|
|
/* Save the groups. */
|
|
if (!push_groups(safe_state))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Save the repeats. */
|
|
if (!push_repeats(safe_state))
|
|
return RE_ERROR_MEMORY;
|
|
} else
|
|
/* The group was not called. */
|
|
node = node->next_1.node;
|
|
|
|
pop_group_return(state);
|
|
break;
|
|
}
|
|
case RE_OP_LAZY_REPEAT: /* Lazy repeat. */
|
|
{
|
|
RE_CODE index;
|
|
RE_RepeatData* rp_data;
|
|
RE_BacktrackData* bt_data;
|
|
BOOL try_body;
|
|
int body_status;
|
|
RE_Position next_body_position;
|
|
BOOL try_tail;
|
|
int tail_status;
|
|
RE_Position next_tail_position;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Repeat indexes are 0-based. */
|
|
index = node->values[0];
|
|
rp_data = &state->repeats[index];
|
|
|
|
/* We might need to backtrack into the head, so save the current
|
|
* repeat.
|
|
*/
|
|
if (!add_backtrack(safe_state, RE_OP_LAZY_REPEAT))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.count = rp_data->count;
|
|
bt_data->repeat.start = rp_data->start;
|
|
bt_data->repeat.capture_change = rp_data->capture_change;
|
|
bt_data->repeat.text_pos = state->text_pos;
|
|
|
|
/* Initialise the new repeat. */
|
|
rp_data->count = 0;
|
|
rp_data->start = state->text_pos;
|
|
rp_data->capture_change = state->capture_change;
|
|
|
|
/* Could the body or tail match? */
|
|
try_body = node->values[2] > 0 && !is_repeat_guarded(safe_state,
|
|
index, state->text_pos, RE_STATUS_BODY);
|
|
if (try_body) {
|
|
body_status = try_match(state, &node->next_1, state->text_pos,
|
|
&next_body_position);
|
|
|
|
if (body_status == RE_ERROR_FAILURE)
|
|
try_body = FALSE;
|
|
} else
|
|
body_status = RE_ERROR_FAILURE;
|
|
|
|
try_tail = node->values[1] == 0;
|
|
if(try_tail) {
|
|
tail_status = try_match(state, &node->nonstring.next_2,
|
|
state->text_pos, &next_tail_position);
|
|
|
|
if (tail_status == RE_ERROR_FAILURE)
|
|
try_tail = FALSE;
|
|
} else
|
|
tail_status = RE_ERROR_FAILURE;
|
|
|
|
if (!try_body && !try_tail)
|
|
/* Neither the body nor the tail could match. */
|
|
goto backtrack;
|
|
|
|
if (body_status < 0 || (body_status == 0 && tail_status < 0))
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (try_body) {
|
|
if (try_tail) {
|
|
/* Both the body and the tail could match, but the tail
|
|
* takes precedence. If the tail fails to match then we
|
|
* want to try the body before backtracking further.
|
|
*/
|
|
|
|
/* Record backtracking info for matching the body. */
|
|
if (!add_backtrack(safe_state, RE_OP_MATCH_BODY))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.position = next_body_position;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.count = rp_data->count;
|
|
bt_data->repeat.start = rp_data->start;
|
|
bt_data->repeat.capture_change = rp_data->capture_change;
|
|
bt_data->repeat.text_pos = state->text_pos;
|
|
|
|
/* Advance into the tail. */
|
|
node = next_tail_position.node;
|
|
state->text_pos = next_tail_position.text_pos;
|
|
} else {
|
|
/* Advance into the body. */
|
|
node = next_body_position.node;
|
|
state->text_pos = next_body_position.text_pos;
|
|
}
|
|
} else {
|
|
/* Only the tail could match. */
|
|
|
|
/* Advance into the tail. */
|
|
node = next_tail_position.node;
|
|
state->text_pos = next_tail_position.text_pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_LAZY_REPEAT_ONE: /* Lazy repeat for one character. */
|
|
{
|
|
RE_CODE index;
|
|
RE_RepeatData* rp_data;
|
|
size_t count;
|
|
BOOL is_partial;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Repeat indexes are 0-based. */
|
|
index = node->values[0];
|
|
rp_data = &state->repeats[index];
|
|
|
|
if (is_repeat_guarded(safe_state, index, state->text_pos,
|
|
RE_STATUS_BODY))
|
|
goto backtrack;
|
|
|
|
/* Count how many times the character repeats, up to the minimum.
|
|
*/
|
|
count = count_one(state, node->nonstring.next_2.node,
|
|
state->text_pos, node->values[1], &is_partial);
|
|
if (is_partial) {
|
|
state->text_pos += (Py_ssize_t)count * node->step;
|
|
return RE_ERROR_PARTIAL;
|
|
}
|
|
|
|
/* Have we matched at least the minimum? */
|
|
if (count < node->values[1]) {
|
|
/* The repeat has failed to match at this position. */
|
|
if (!guard_repeat(safe_state, index, state->text_pos,
|
|
RE_STATUS_BODY, TRUE))
|
|
return RE_ERROR_MEMORY;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (count < node->values[2]) {
|
|
/* The match is shorter than the maximum, so we might need to
|
|
* backtrack the repeat to consume more.
|
|
*/
|
|
RE_BacktrackData* bt_data;
|
|
|
|
/* Get the data for this repeat (same entry as fetched above). */
|
|
rp_data = &state->repeats[index];
|
|
if (!add_backtrack(safe_state, RE_OP_LAZY_REPEAT_ONE))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->repeat.position.node = node;
|
|
bt_data->repeat.index = index;
|
|
bt_data->repeat.text_pos = rp_data->start;
|
|
bt_data->repeat.count = rp_data->count;
|
|
|
|
rp_data->start = state->text_pos;
|
|
rp_data->count = count;
|
|
}
|
|
|
|
/* Advance into the tail. */
|
|
state->text_pos += (Py_ssize_t)count * node->step;
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_LOOKAROUND: /* Lookaround. */
|
|
{
|
|
RE_Info info;
|
|
size_t capture_change;
|
|
Py_ssize_t saved_slice_start;
|
|
Py_ssize_t saved_slice_end;
|
|
Py_ssize_t saved_text_pos;
|
|
BOOL too_few_errors;
|
|
int status;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->match))
|
|
|
|
/* Save the groups. */
|
|
if (!push_groups(safe_state))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
capture_change = state->capture_change;
|
|
|
|
/* Save the other info. */
|
|
save_info(state, &info);
|
|
|
|
saved_slice_start = state->slice_start;
|
|
saved_slice_end = state->slice_end;
|
|
saved_text_pos = state->text_pos;
|
|
state->slice_start = 0;
|
|
state->slice_end = state->text_length;
|
|
state->must_advance = FALSE;
|
|
|
|
too_few_errors = state->too_few_errors;
|
|
|
|
status = basic_match(safe_state, node->nonstring.next_2.node,
|
|
FALSE, TRUE);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
reset_guards(state, node->values + 1);
|
|
|
|
state->text_pos = saved_text_pos;
|
|
state->slice_end = saved_slice_end;
|
|
state->slice_start = saved_slice_start;
|
|
|
|
/* Restore the other info. */
|
|
restore_info(state, &info);
|
|
|
|
if (node->match) {
|
|
/* It's a positive lookaround. */
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
/* It succeeded, so the groups and certain flags may have
|
|
* changed.
|
|
*/
|
|
if (!add_backtrack(safe_state, RE_OP_LOOKAROUND))
|
|
return RE_ERROR_BACKTRACKING;
|
|
|
|
/* We'll restore the groups and flags on backtracking. */
|
|
state->backtrack->lookaround.too_few_errors =
|
|
too_few_errors;
|
|
state->backtrack->lookaround.capture_change =
|
|
capture_change;
|
|
} else {
|
|
/* It failed, so the groups and certain flags haven't
|
|
* changed.
|
|
*/
|
|
drop_groups(state);
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
/* It's a negative lookaround. */
|
|
if (status == RE_ERROR_SUCCESS) {
|
|
/* It succeeded, so the groups and certain flags may have
|
|
* changed. We need to restore them.
|
|
*/
|
|
pop_groups(state);
|
|
state->too_few_errors = too_few_errors;
|
|
state->capture_change = capture_change;
|
|
goto backtrack;
|
|
} else
|
|
/* It failed, so the groups and certain flags haven't
|
|
* changed.
|
|
*/
|
|
drop_groups(state);
|
|
}
|
|
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_PROPERTY: /* A property. */
|
|
TRACE(("%s %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0]))
|
|
|
|
if (state->text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end &&
|
|
matches_PROPERTY(encoding, node, char_at(state->text,
|
|
state->text_pos)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_PROPERTY_IGN: /* A property, ignoring case. */
|
|
TRACE(("%s %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0]))
|
|
|
|
if (state->text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end &&
|
|
matches_PROPERTY_IGN(encoding, node, char_at(state->text,
|
|
state->text_pos)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_PROPERTY_IGN_REV: /* A property, backwards, ignoring case. */
|
|
TRACE(("%s %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0]))
|
|
|
|
if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
matches_PROPERTY_IGN(encoding, node, char_at(state->text,
|
|
state->text_pos - 1)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_PROPERTY_REV: /* A property, backwards. */
|
|
TRACE(("%s %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0]))
|
|
|
|
if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
matches_PROPERTY(encoding, node, char_at(state->text,
|
|
state->text_pos - 1)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_RANGE: /* A range. */
|
|
TRACE(("%s %d %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0], node->values[1]))
|
|
|
|
if (state->text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end && matches_RANGE(encoding,
|
|
node, char_at(state->text, state->text_pos)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_RANGE_IGN: /* A range, ignoring case. */
|
|
TRACE(("%s %d %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0], node->values[1]))
|
|
|
|
if (state->text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end &&
|
|
matches_RANGE_IGN(encoding, node, char_at(state->text,
|
|
state->text_pos)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_RANGE_IGN_REV: /* A range, backwards, ignoring case. */
|
|
TRACE(("%s %d %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0], node->values[1]))
|
|
|
|
if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
matches_RANGE_IGN(encoding, node, char_at(state->text,
|
|
state->text_pos - 1)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_RANGE_REV: /* A range, backwards. */
|
|
TRACE(("%s %d %d %d\n", re_op_text[node->op], node->match,
|
|
node->values[0], node->values[1]))
|
|
|
|
if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start && matches_RANGE(encoding,
|
|
node, char_at(state->text, state->text_pos - 1)) == node->match)
|
|
{
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_REF_GROUP: /* Reference to a capture group. */
|
|
{
|
|
RE_GroupData* group;
|
|
RE_GroupSpan* span;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*
|
|
* Check whether the captured text, if any, exists at this position
|
|
* in the string.
|
|
*/
|
|
|
|
/* Did the group capture anything? */
|
|
group = &state->groups[node->values[0] - 1];
|
|
if (group->current_capture < 0)
|
|
goto backtrack;
|
|
|
|
span = &group->captures[group->current_capture];
|
|
|
|
if (string_pos < 0)
|
|
string_pos = span->start;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos < span->end) {
|
|
if (state->text_pos >= state->text_length &&
|
|
state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end && same_char(encoding,
|
|
char_at(state->text, state->text_pos), char_at(state->text,
|
|
string_pos))) {
|
|
++string_pos;
|
|
++state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &matched, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_REF_GROUP_FLD: /* Reference to a capture group, ignoring case. */
|
|
{
|
|
RE_GroupData* group;
|
|
RE_GroupSpan* span;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
int folded_len;
|
|
int gfolded_len;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
Py_UCS4 gfolded[RE_MAX_FOLDED];
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*
|
|
* Check whether the captured text, if any, exists at this position
|
|
* in the string.
|
|
*/
|
|
|
|
/* Did the group capture anything? */
|
|
group = &state->groups[node->values[0] - 1];
|
|
if (group->current_capture < 0)
|
|
goto backtrack;
|
|
|
|
span = &group->captures[group->current_capture];
|
|
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
if (string_pos < 0) {
|
|
string_pos = span->start;
|
|
folded_pos = 0;
|
|
folded_len = 0;
|
|
gfolded_pos = 0;
|
|
gfolded_len = 0;
|
|
} else {
|
|
folded_len = full_case_fold(char_at(state->text,
|
|
state->text_pos), folded);
|
|
gfolded_len = full_case_fold(char_at(state->text, string_pos),
|
|
gfolded);
|
|
}
|
|
|
|
/* Try comparing. */
|
|
while (string_pos < span->end) {
|
|
/* Case-fold at current position in text. */
|
|
if (folded_pos >= folded_len) {
|
|
if (state->text_pos >= state->text_length &&
|
|
state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end)
|
|
folded_len = full_case_fold(char_at(state->text,
|
|
state->text_pos), folded);
|
|
else
|
|
folded_len = 0;
|
|
|
|
folded_pos = 0;
|
|
}
|
|
|
|
/* Case-fold at current position in group. */
|
|
if (gfolded_pos >= gfolded_len) {
|
|
gfolded_len = full_case_fold(char_at(state->text,
|
|
string_pos), gfolded);
|
|
gfolded_pos = 0;
|
|
}
|
|
|
|
if (folded_pos < folded_len && folded[folded_pos] ==
|
|
gfolded[gfolded_pos]) {
|
|
++folded_pos;
|
|
++gfolded_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_group_fld(safe_state, search,
|
|
&state->text_pos, node, &folded_pos, folded_len,
|
|
&string_pos, &gfolded_pos, gfolded_len, &matched, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (folded_pos >= folded_len && folded_len > 0)
|
|
++state->text_pos;
|
|
|
|
if (gfolded_pos >= gfolded_len)
|
|
++string_pos;
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
if (folded_pos < folded_len || gfolded_pos < gfolded_len)
|
|
goto backtrack;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_REF_GROUP_FLD_REV: /* Reference to a capture group, ignoring case. */
|
|
{
|
|
RE_GroupData* group;
|
|
RE_GroupSpan* span;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
int folded_len;
|
|
int gfolded_len;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
Py_UCS4 gfolded[RE_MAX_FOLDED];
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*
|
|
* Check whether the captured text, if any, exists at this position
|
|
* in the string.
|
|
*/
|
|
|
|
/* Did the group capture anything? */
|
|
group = &state->groups[node->values[0] - 1];
|
|
if (group->current_capture < 0)
|
|
goto backtrack;
|
|
|
|
span = &group->captures[group->current_capture];
|
|
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
if (string_pos < 0) {
|
|
string_pos = span->end;
|
|
folded_pos = 0;
|
|
folded_len = 0;
|
|
gfolded_pos = 0;
|
|
gfolded_len = 0;
|
|
} else {
|
|
folded_len = full_case_fold(char_at(state->text,
|
|
state->text_pos - 1), folded);
|
|
gfolded_len = full_case_fold(char_at(state->text, string_pos -
|
|
1), gfolded);
|
|
}
|
|
|
|
/* Try comparing. */
|
|
while (string_pos > span->start) {
|
|
/* Case-fold at current position in text. */
|
|
if (folded_pos <= 0) {
|
|
if (state->text_pos <= 0 && state->partial_side ==
|
|
RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start)
|
|
folded_len = full_case_fold(char_at(state->text,
|
|
state->text_pos - 1), folded);
|
|
else
|
|
folded_len = 0;
|
|
|
|
folded_pos = folded_len;
|
|
}
|
|
|
|
/* Case-fold at current position in group. */
|
|
if (gfolded_pos <= 0) {
|
|
gfolded_len = full_case_fold(char_at(state->text,
|
|
string_pos - 1), gfolded);
|
|
gfolded_pos = gfolded_len;
|
|
}
|
|
|
|
if (folded_pos > 0 && folded[folded_pos - 1] ==
|
|
gfolded[gfolded_pos - 1]) {
|
|
--folded_pos;
|
|
--gfolded_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_group_fld(safe_state, search,
|
|
&state->text_pos, node, &folded_pos, folded_len,
|
|
&string_pos, &gfolded_pos, gfolded_len, &matched, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (folded_pos <= 0 && folded_len > 0)
|
|
--state->text_pos;
|
|
|
|
if (gfolded_pos <= 0)
|
|
--string_pos;
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
if (folded_pos > 0 || gfolded_pos > 0)
|
|
goto backtrack;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_REF_GROUP_IGN: /* Reference to a capture group, ignoring case. */
|
|
{
|
|
RE_GroupData* group;
|
|
RE_GroupSpan* span;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*
|
|
* Check whether the captured text, if any, exists at this position
|
|
* in the string.
|
|
*/
|
|
|
|
/* Did the group capture anything? */
|
|
group = &state->groups[node->values[0] - 1];
|
|
if (group->current_capture < 0)
|
|
goto backtrack;
|
|
|
|
span = &group->captures[group->current_capture];
|
|
|
|
if (string_pos < 0)
|
|
string_pos = span->start;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos < span->end) {
|
|
if (state->text_pos >= state->text_length &&
|
|
state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end &&
|
|
same_char_ign(encoding, char_at(state->text,
|
|
state->text_pos), char_at(state->text, string_pos))) {
|
|
++string_pos;
|
|
++state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &matched, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_REF_GROUP_IGN_REV: /* Reference to a capture group, ignoring case. */
|
|
{
|
|
RE_GroupData* group;
|
|
RE_GroupSpan* span;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*
|
|
* Check whether the captured text, if any, exists at this position
|
|
* in the string.
|
|
*/
|
|
|
|
/* Did the group capture anything? */
|
|
group = &state->groups[node->values[0] - 1];
|
|
if (group->current_capture < 0)
|
|
goto backtrack;
|
|
|
|
span = &group->captures[group->current_capture];
|
|
|
|
if (string_pos < 0)
|
|
string_pos = span->end;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos > span->start) {
|
|
if (state->text_pos <= 0 && state->partial_side ==
|
|
RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
same_char_ign(encoding, char_at(state->text, state->text_pos
|
|
- 1), char_at(state->text, string_pos - 1))) {
|
|
--string_pos;
|
|
--state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &matched, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_REF_GROUP_REV: /* Reference to a capture group. */
|
|
{
|
|
RE_GroupData* group;
|
|
RE_GroupSpan* span;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*
|
|
* Check whether the captured text, if any, exists at this position
|
|
* in the string.
|
|
*/
|
|
|
|
/* Did the group capture anything? */
|
|
group = &state->groups[node->values[0] - 1];
|
|
if (group->current_capture < 0)
|
|
goto backtrack;
|
|
|
|
span = &group->captures[group->current_capture];
|
|
|
|
if (string_pos < 0)
|
|
string_pos = span->end;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos > span->start) {
|
|
if (state->text_pos <= 0 && state->partial_side ==
|
|
RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start && same_char(encoding,
|
|
char_at(state->text, state->text_pos - 1),
|
|
char_at(state->text, string_pos - 1))) {
|
|
--string_pos;
|
|
--state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &matched, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_SEARCH_ANCHOR: /* At the start of the search. */
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[0]))
|
|
|
|
if (state->text_pos == state->search_anchor)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_SET_DIFF: /* Character set. */
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_UNION:
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->match))
|
|
|
|
if (state->text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end && matches_SET(encoding,
|
|
node, char_at(state->text, state->text_pos)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_SET_DIFF_IGN: /* Character set, ignoring case. */
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_UNION_IGN:
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->match))
|
|
|
|
if (state->text_pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end && matches_SET_IGN(encoding,
|
|
node, char_at(state->text, state->text_pos)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_SET_DIFF_IGN_REV: /* Character set, ignoring case. */
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->match))
|
|
|
|
if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
matches_SET_IGN(encoding, node, char_at(state->text,
|
|
state->text_pos - 1)) == node->match) {
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_SET_DIFF_REV: /* Character set. */
|
|
case RE_OP_SET_INTER_REV:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
case RE_OP_SET_UNION_REV:
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->match))
|
|
|
|
if (state->text_pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start && matches_SET(encoding,
|
|
node, char_at(state->text, state->text_pos - 1)) == node->match)
|
|
{
|
|
state->text_pos += node->step;
|
|
node = node->next_1.node;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_START_GROUP: /* Start of a capture group. */
|
|
{
|
|
RE_CODE private_index;
|
|
RE_CODE public_index;
|
|
RE_GroupData* group;
|
|
RE_BacktrackData* bt_data;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->values[1]))
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is
|
|
* the entire matched string).
|
|
*/
|
|
private_index = node->values[0];
|
|
public_index = node->values[1];
|
|
group = &state->groups[private_index - 1];
|
|
|
|
if (!add_backtrack(safe_state, RE_OP_START_GROUP))
|
|
return RE_ERROR_BACKTRACKING;
|
|
bt_data = state->backtrack;
|
|
bt_data->group.private_index = private_index;
|
|
bt_data->group.public_index = public_index;
|
|
bt_data->group.text_pos = group->span.start;
|
|
bt_data->group.capture = (BOOL)node->values[2];
|
|
bt_data->group.current_capture = group->current_capture;
|
|
|
|
if (pattern->group_info[private_index - 1].referenced &&
|
|
group->span.start != state->text_pos)
|
|
++state->capture_change;
|
|
group->span.start = state->text_pos;
|
|
|
|
/* Save the capture? */
|
|
if (node->values[2]) {
|
|
group->current_capture = (Py_ssize_t)group->capture_count;
|
|
if (!save_capture(safe_state, private_index, public_index))
|
|
return RE_ERROR_MEMORY;
|
|
}
|
|
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_START_OF_LINE: /* At the start of a line. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_START_OF_LINE(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_START_OF_LINE_U: /* At the start of a line. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_START_OF_LINE_U(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_START_OF_STRING: /* At the start of the string. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_START_OF_STRING(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_START_OF_WORD: /* At the start of a word. */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = try_match_START_OF_WORD(state, node, state->text_pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
node = node->next_1.node;
|
|
else if (node->status & RE_STATUS_FUZZY) {
|
|
status = fuzzy_match_item(safe_state, search, &state->text_pos,
|
|
&node, 0);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (!node)
|
|
goto backtrack;
|
|
} else
|
|
goto backtrack;
|
|
break;
|
|
case RE_OP_STRING: /* A string. */
|
|
{
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->value_count))
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && state->text_pos ==
|
|
state->req_pos && string_pos < 0)
|
|
state->text_pos = state->req_end;
|
|
else {
|
|
length = (Py_ssize_t)node->value_count;
|
|
|
|
if (string_pos < 0)
|
|
string_pos = 0;
|
|
|
|
values = node->values;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos < length) {
|
|
if (state->text_pos >= state->text_length &&
|
|
state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end &&
|
|
same_char(encoding, char_at(state->text,
|
|
state->text_pos), values[string_pos])) {
|
|
++string_pos;
|
|
++state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &matched, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD: /* A string, ignoring case. */
|
|
{
|
|
Py_ssize_t length;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
RE_CODE* values;
|
|
int folded_len;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->value_count))
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && state->text_pos ==
|
|
state->req_pos && string_pos < 0)
|
|
state->text_pos = state->req_end;
|
|
else {
|
|
length = (Py_ssize_t)node->value_count;
|
|
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
if (string_pos < 0) {
|
|
string_pos = 0;
|
|
folded_pos = 0;
|
|
folded_len = 0;
|
|
} else {
|
|
folded_len = full_case_fold(char_at(state->text,
|
|
state->text_pos), folded);
|
|
if (folded_pos >= folded_len) {
|
|
if (state->text_pos >= state->slice_end)
|
|
goto backtrack;
|
|
|
|
++state->text_pos;
|
|
folded_pos = 0;
|
|
folded_len = 0;
|
|
}
|
|
}
|
|
|
|
values = node->values;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos < length) {
|
|
if (folded_pos >= folded_len) {
|
|
if (state->text_pos >= state->text_length &&
|
|
state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
folded_len = full_case_fold(char_at(state->text,
|
|
state->text_pos), folded);
|
|
folded_pos = 0;
|
|
}
|
|
|
|
if (same_char_ign(encoding, folded[folded_pos],
|
|
values[string_pos])) {
|
|
++string_pos;
|
|
++folded_pos;
|
|
|
|
if (folded_pos >= folded_len)
|
|
++state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string_fld(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &folded_pos,
|
|
folded_len, &matched, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (folded_pos >= folded_len)
|
|
++state->text_pos;
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
|
|
if (node->status & RE_STATUS_FUZZY) {
|
|
while (folded_pos < folded_len) {
|
|
BOOL matched;
|
|
|
|
if (!fuzzy_match_string_fld(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &folded_pos,
|
|
folded_len, &matched, 1))
|
|
return RE_ERROR_BACKTRACKING;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (folded_pos >= folded_len)
|
|
++state->text_pos;
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
if (folded_pos < folded_len)
|
|
goto backtrack;
|
|
}
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD_REV: /* A string, ignoring case. */
|
|
{
|
|
Py_ssize_t length;
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
RE_CODE* values;
|
|
int folded_len;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->value_count))
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && state->text_pos ==
|
|
state->req_pos && string_pos < 0)
|
|
state->text_pos = state->req_end;
|
|
else {
|
|
length = (Py_ssize_t)node->value_count;
|
|
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
if (string_pos < 0) {
|
|
string_pos = length;
|
|
folded_pos = 0;
|
|
folded_len = 0;
|
|
} else {
|
|
folded_len = full_case_fold(char_at(state->text,
|
|
state->text_pos - 1), folded);
|
|
if (folded_pos <= 0) {
|
|
if (state->text_pos <= state->slice_start)
|
|
goto backtrack;
|
|
|
|
--state->text_pos;
|
|
folded_pos = 0;
|
|
folded_len = 0;
|
|
}
|
|
}
|
|
|
|
values = node->values;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos > 0) {
|
|
if (folded_pos <= 0) {
|
|
if (state->text_pos <= 0 && state->partial_side ==
|
|
RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
folded_len = full_case_fold(char_at(state->text,
|
|
state->text_pos - 1), folded);
|
|
folded_pos = folded_len;
|
|
}
|
|
|
|
if (same_char_ign(encoding, folded[folded_pos - 1],
|
|
values[string_pos - 1])) {
|
|
--string_pos;
|
|
--folded_pos;
|
|
|
|
if (folded_pos <= 0)
|
|
--state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string_fld(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &folded_pos,
|
|
folded_len, &matched, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (folded_pos <= 0)
|
|
--state->text_pos;
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
|
|
if (node->status & RE_STATUS_FUZZY) {
|
|
while (folded_pos > 0) {
|
|
BOOL matched;
|
|
|
|
if (!fuzzy_match_string_fld(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &folded_pos,
|
|
folded_len, &matched, -1))
|
|
return RE_ERROR_BACKTRACKING;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (folded_pos <= 0)
|
|
--state->text_pos;
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
if (folded_pos > 0)
|
|
goto backtrack;
|
|
}
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN: /* A string, ignoring case. */
|
|
{
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->value_count))
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && state->text_pos ==
|
|
state->req_pos && string_pos < 0)
|
|
state->text_pos = state->req_end;
|
|
else {
|
|
length = (Py_ssize_t)node->value_count;
|
|
|
|
if (string_pos < 0)
|
|
string_pos = 0;
|
|
|
|
values = node->values;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos < length) {
|
|
if (state->text_pos >= state->text_length &&
|
|
state->partial_side == RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos < state->slice_end &&
|
|
same_char_ign(encoding, char_at(state->text,
|
|
state->text_pos), values[string_pos])) {
|
|
++string_pos;
|
|
++state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &matched, 1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN_REV: /* A string, ignoring case. */
|
|
{
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->value_count))
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && state->text_pos ==
|
|
state->req_pos && string_pos < 0)
|
|
state->text_pos = state->req_end;
|
|
else {
|
|
length = (Py_ssize_t)node->value_count;
|
|
|
|
if (string_pos < 0)
|
|
string_pos = length;
|
|
|
|
values = node->values;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos > 0) {
|
|
if (state->text_pos <= 0 && state->partial_side ==
|
|
RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
same_char_ign(encoding, char_at(state->text,
|
|
state->text_pos - 1), values[string_pos - 1])) {
|
|
--string_pos;
|
|
--state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &matched, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_REV: /* A string. */
|
|
{
|
|
Py_ssize_t length;
|
|
RE_CODE* values;
|
|
TRACE(("%s %d\n", re_op_text[node->op], node->value_count))
|
|
|
|
if ((node->status & RE_STATUS_REQUIRED) && state->text_pos ==
|
|
state->req_pos && string_pos < 0)
|
|
state->text_pos = state->req_end;
|
|
else {
|
|
length = (Py_ssize_t)node->value_count;
|
|
|
|
if (string_pos < 0)
|
|
string_pos = length;
|
|
|
|
values = node->values;
|
|
|
|
/* Try comparing. */
|
|
while (string_pos > 0) {
|
|
if (state->text_pos <= 0 && state->partial_side ==
|
|
RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (state->text_pos > state->slice_start &&
|
|
same_char(encoding, char_at(state->text, state->text_pos
|
|
- 1), values[string_pos - 1])) {
|
|
--string_pos;
|
|
--state->text_pos;
|
|
} else if (node->status & RE_STATUS_FUZZY) {
|
|
BOOL matched;
|
|
|
|
status = fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, node, &string_pos, &matched, -1);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (!matched) {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
} else {
|
|
string_pos = -1;
|
|
goto backtrack;
|
|
}
|
|
}
|
|
}
|
|
|
|
string_pos = -1;
|
|
|
|
/* Successful match. */
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_SET: /* Member of a string set. */
|
|
{
|
|
int status;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = string_set_match_fwdrev(safe_state, node, FALSE);
|
|
if (status < 0)
|
|
return status;
|
|
if (status == 0)
|
|
goto backtrack;
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_SET_FLD: /* Member of a string set, ignoring case. */
|
|
{
|
|
int status;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = string_set_match_fld_fwdrev(safe_state, node, FALSE);
|
|
if (status < 0)
|
|
return status;
|
|
if (status == 0)
|
|
goto backtrack;
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_SET_FLD_REV: /* Member of a string set, ignoring case. */
|
|
{
|
|
int status;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = string_set_match_fld_fwdrev(safe_state, node, TRUE);
|
|
if (status < 0)
|
|
return status;
|
|
if (status == 0)
|
|
goto backtrack;
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_SET_IGN: /* Member of a string set, ignoring case. */
|
|
{
|
|
int status;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = string_set_match_ign_fwdrev(safe_state, node, FALSE);
|
|
if (status < 0)
|
|
return status;
|
|
if (status == 0)
|
|
goto backtrack;
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_SET_IGN_REV: /* Member of a string set, ignoring case. */
|
|
{
|
|
int status;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = string_set_match_ign_fwdrev(safe_state, node, TRUE);
|
|
if (status < 0)
|
|
return status;
|
|
if (status == 0)
|
|
goto backtrack;
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_STRING_SET_REV: /* Member of a string set. */
|
|
{
|
|
int status;
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
status = string_set_match_fwdrev(safe_state, node, TRUE);
|
|
if (status < 0)
|
|
return status;
|
|
if (status == 0)
|
|
goto backtrack;
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
case RE_OP_SUCCESS: /* Success. */
|
|
/* Must the match advance past its start? */
|
|
TRACE(("%s\n", re_op_text[node->op]))
|
|
|
|
if (state->text_pos == state->search_anchor && state->must_advance)
|
|
goto backtrack;
|
|
|
|
if (state->match_all && !recursive_call) {
|
|
/* We want to match all of the slice. */
|
|
if (state->reverse) {
|
|
if (state->text_pos != state->slice_start)
|
|
goto backtrack;
|
|
} else {
|
|
if (state->text_pos != state->slice_end)
|
|
goto backtrack;
|
|
}
|
|
}
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
default: /* Illegal opcode! */
|
|
TRACE(("UNKNOWN OP %d\n", node->op))
|
|
return RE_ERROR_ILLEGAL;
|
|
}
|
|
}
|
|
|
|
backtrack:
|
|
for (;;) {
|
|
RE_BacktrackData* bt_data;
|
|
TRACE(("BACKTRACK "))
|
|
|
|
/* Should we abort the matching? */
|
|
++state->iterations;
|
|
|
|
if (state->iterations == 0 && safe_check_signals(safe_state))
|
|
return RE_ERROR_INTERRUPTED;
|
|
|
|
bt_data = last_backtrack(state);
|
|
|
|
switch (bt_data->op) {
|
|
case RE_OP_ANY: /* Any character except a newline. */
|
|
case RE_OP_ANY_ALL: /* Any character at all. */
|
|
case RE_OP_ANY_ALL_REV: /* Any character at all, backwards. */
|
|
case RE_OP_ANY_REV: /* Any character except a newline, backwards. */
|
|
case RE_OP_ANY_U: /* Any character except a line separator. */
|
|
case RE_OP_ANY_U_REV: /* Any character except a line separator, backwards. */
|
|
case RE_OP_CHARACTER: /* A character. */
|
|
case RE_OP_CHARACTER_IGN: /* A character, ignoring case. */
|
|
case RE_OP_CHARACTER_IGN_REV: /* A character, ignoring case, backwards. */
|
|
case RE_OP_CHARACTER_REV: /* A character, backwards. */
|
|
case RE_OP_PROPERTY: /* A property. */
|
|
case RE_OP_PROPERTY_IGN: /* A property, ignoring case. */
|
|
case RE_OP_PROPERTY_IGN_REV: /* A property, ignoring case, backwards. */
|
|
case RE_OP_PROPERTY_REV: /* A property, backwards. */
|
|
case RE_OP_RANGE: /* A range. */
|
|
case RE_OP_RANGE_IGN: /* A range, ignoring case. */
|
|
case RE_OP_RANGE_IGN_REV: /* A range, ignoring case, backwards. */
|
|
case RE_OP_RANGE_REV: /* A range, backwards. */
|
|
case RE_OP_SET_DIFF: /* Set difference. */
|
|
case RE_OP_SET_DIFF_IGN: /* Set difference, ignoring case. */
|
|
case RE_OP_SET_DIFF_IGN_REV: /* Set difference, ignoring case, backwards. */
|
|
case RE_OP_SET_DIFF_REV: /* Set difference, backwards. */
|
|
case RE_OP_SET_INTER: /* Set intersection. */
|
|
case RE_OP_SET_INTER_IGN: /* Set intersection, ignoring case. */
|
|
case RE_OP_SET_INTER_IGN_REV: /* Set intersection, ignoring case, backwards. */
|
|
case RE_OP_SET_INTER_REV: /* Set intersection, backwards. */
|
|
case RE_OP_SET_SYM_DIFF: /* Set symmetric difference. */
|
|
case RE_OP_SET_SYM_DIFF_IGN: /* Set symmetric difference, ignoring case. */
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV: /* Set symmetric difference, ignoring case, backwards. */
|
|
case RE_OP_SET_SYM_DIFF_REV: /* Set symmetric difference, backwards. */
|
|
case RE_OP_SET_UNION: /* Set union. */
|
|
case RE_OP_SET_UNION_IGN: /* Set union, ignoring case. */
|
|
case RE_OP_SET_UNION_IGN_REV: /* Set union, ignoring case, backwards. */
|
|
case RE_OP_SET_UNION_REV: /* Set union, backwards. */
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
status = retry_fuzzy_match_item(safe_state, search,
|
|
&state->text_pos, &node, TRUE);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (node)
|
|
goto advance;
|
|
break;
|
|
case RE_OP_ATOMIC: /* Atomic subpattern. */
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
/* Restore the groups and certain flags and then backtrack. */
|
|
pop_groups(state);
|
|
state->too_few_errors = bt_data->atomic.too_few_errors;
|
|
state->capture_change = bt_data->atomic.capture_change;
|
|
discard_backtrack(state);
|
|
break;
|
|
case RE_OP_BODY_END:
|
|
{
|
|
RE_RepeatData* rp_data;
|
|
TRACE(("%s %d\n", re_op_text[bt_data->op], bt_data->repeat.index))
|
|
|
|
/* We're backtracking into the body. */
|
|
rp_data = &state->repeats[bt_data->repeat.index];
|
|
|
|
/* Restore the repeat info. */
|
|
rp_data->count = bt_data->repeat.count;
|
|
rp_data->start = bt_data->repeat.start;
|
|
rp_data->capture_change = bt_data->repeat.capture_change;
|
|
|
|
discard_backtrack(state);
|
|
break;
|
|
}
|
|
case RE_OP_BODY_START:
|
|
{
|
|
TRACE(("%s %d\n", re_op_text[bt_data->op], bt_data->repeat.index))
|
|
|
|
/* The body may have failed to match at this position. */
|
|
if (!guard_repeat(safe_state, bt_data->repeat.index,
|
|
bt_data->repeat.text_pos, RE_STATUS_BODY, TRUE))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
discard_backtrack(state);
|
|
break;
|
|
}
|
|
case RE_OP_BOUNDARY: /* On a word boundary. */
|
|
case RE_OP_DEFAULT_BOUNDARY: /* On a default word boundary. */
|
|
case RE_OP_DEFAULT_END_OF_WORD: /* At a default end of a word. */
|
|
case RE_OP_DEFAULT_START_OF_WORD: /* At a default start of a word. */
|
|
case RE_OP_END_OF_LINE: /* At the end of a line. */
|
|
case RE_OP_END_OF_LINE_U: /* At the end of a line. */
|
|
case RE_OP_END_OF_STRING: /* At the end of the string. */
|
|
case RE_OP_END_OF_STRING_LINE: /* At end of string or final newline. */
|
|
case RE_OP_END_OF_STRING_LINE_U: /* At end of string or final newline. */
|
|
case RE_OP_END_OF_WORD: /* At end of a word. */
|
|
case RE_OP_GRAPHEME_BOUNDARY: /* On a grapheme boundary. */
|
|
case RE_OP_SEARCH_ANCHOR: /* At the start of the search. */
|
|
case RE_OP_START_OF_LINE: /* At the start of a line. */
|
|
case RE_OP_START_OF_LINE_U: /* At the start of a line. */
|
|
case RE_OP_START_OF_STRING: /* At the start of the string. */
|
|
case RE_OP_START_OF_WORD: /* At start of a word. */
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
status = retry_fuzzy_match_item(safe_state, search,
|
|
&state->text_pos, &node, FALSE);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (node)
|
|
goto advance;
|
|
break;
|
|
case RE_OP_BRANCH: /* 2-way branch. */
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
node = bt_data->branch.position.node;
|
|
state->text_pos = bt_data->branch.position.text_pos;
|
|
discard_backtrack(state);
|
|
goto advance;
|
|
case RE_OP_CALL_REF: /* A group call ref. */
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
pop_group_return(state);
|
|
discard_backtrack(state);
|
|
break;
|
|
case RE_OP_END_FUZZY: /* End of fuzzy matching. */
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
state->total_fuzzy_counts[RE_FUZZY_SUB] -=
|
|
state->fuzzy_info.counts[RE_FUZZY_SUB];
|
|
state->total_fuzzy_counts[RE_FUZZY_INS] -=
|
|
state->fuzzy_info.counts[RE_FUZZY_INS];
|
|
state->total_fuzzy_counts[RE_FUZZY_DEL] -=
|
|
state->fuzzy_info.counts[RE_FUZZY_DEL];
|
|
|
|
/* We need to retry the fuzzy match. */
|
|
status = retry_fuzzy_insert(safe_state, &state->text_pos, &node);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
/* If there were too few errors in the fuzzy section, try again.
|
|
*/
|
|
if (state->too_few_errors) {
|
|
state->too_few_errors = FALSE;
|
|
goto backtrack;
|
|
}
|
|
|
|
if (node) {
|
|
state->total_fuzzy_counts[RE_FUZZY_SUB] +=
|
|
state->fuzzy_info.counts[RE_FUZZY_SUB];
|
|
state->total_fuzzy_counts[RE_FUZZY_INS] +=
|
|
state->fuzzy_info.counts[RE_FUZZY_INS];
|
|
state->total_fuzzy_counts[RE_FUZZY_DEL] +=
|
|
state->fuzzy_info.counts[RE_FUZZY_DEL];
|
|
|
|
node = node->next_1.node;
|
|
goto advance;
|
|
}
|
|
break;
|
|
case RE_OP_END_GROUP: /* End of a capture group. */
|
|
{
|
|
RE_CODE private_index;
|
|
RE_GroupData* group;
|
|
TRACE(("%s %d\n", re_op_text[bt_data->op],
|
|
bt_data->group.public_index))
|
|
|
|
private_index = bt_data->group.private_index;
|
|
group = &state->groups[private_index - 1];
|
|
|
|
/* Unsave the capture? */
|
|
if (bt_data->group.capture)
|
|
unsave_capture(state, bt_data->group.private_index,
|
|
bt_data->group.public_index);
|
|
|
|
if (pattern->group_info[private_index - 1].referenced &&
|
|
group->span.end != bt_data->group.text_pos)
|
|
--state->capture_change;
|
|
group->span.end = bt_data->group.text_pos;
|
|
group->current_capture = bt_data->group.current_capture;
|
|
|
|
discard_backtrack(state);
|
|
break;
|
|
}
|
|
case RE_OP_FAILURE:
|
|
{
|
|
Py_ssize_t end_pos;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
/* Do we have to advance? */
|
|
if (!search)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
/* Can we advance? */
|
|
state->text_pos = state->match_pos;
|
|
end_pos = state->reverse ? state->slice_start : state->slice_end;
|
|
if (state->text_pos == end_pos)
|
|
return RE_ERROR_FAILURE;
|
|
|
|
/* Skip over any repeated leading characters. */
|
|
switch (start_node->op) {
|
|
case RE_OP_GREEDY_REPEAT_ONE:
|
|
case RE_OP_LAZY_REPEAT_ONE:
|
|
{
|
|
size_t count;
|
|
BOOL is_partial;
|
|
|
|
/* How many characters did the repeat actually match? */
|
|
count = count_one(state, start_node->nonstring.next_2.node,
|
|
state->text_pos, start_node->values[2], &is_partial);
|
|
|
|
/* If it's fewer than the maximum then skip over those
|
|
* characters.
|
|
*/
|
|
if (count < start_node->values[2])
|
|
state->text_pos += (Py_ssize_t)count * pattern_step;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Advance and try to match again. */
|
|
state->text_pos += pattern_step;
|
|
|
|
goto start_match;
|
|
}
|
|
case RE_OP_FUZZY: /* Fuzzy matching. */
|
|
{
|
|
RE_FuzzyInfo* fuzzy_info;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
/* Restore the previous fuzzy info. */
|
|
fuzzy_info = &state->fuzzy_info;
|
|
memmove(fuzzy_info, &bt_data->fuzzy.fuzzy_info,
|
|
sizeof(RE_FuzzyInfo));
|
|
|
|
discard_backtrack(state);
|
|
break;
|
|
}
|
|
case RE_OP_GREEDY_REPEAT: /* Greedy repeat. */
|
|
case RE_OP_LAZY_REPEAT: /* Lazy repeat. */
|
|
{
|
|
RE_RepeatData* rp_data;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
/* The repeat failed to match. */
|
|
rp_data = &state->repeats[bt_data->repeat.index];
|
|
|
|
/* The body may have failed to match at this position. */
|
|
if (!guard_repeat(safe_state, bt_data->repeat.index,
|
|
bt_data->repeat.text_pos, RE_STATUS_BODY, TRUE))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Restore the previous repeat. */
|
|
rp_data->count = bt_data->repeat.count;
|
|
rp_data->start = bt_data->repeat.start;
|
|
rp_data->capture_change = bt_data->repeat.capture_change;
|
|
|
|
discard_backtrack(state);
|
|
break;
|
|
}
|
|
case RE_OP_GREEDY_REPEAT_ONE: /* Greedy repeat for one character. */
|
|
{
|
|
RE_RepeatData* rp_data;
|
|
size_t count;
|
|
Py_ssize_t step;
|
|
Py_ssize_t pos;
|
|
Py_ssize_t limit;
|
|
RE_Node* test;
|
|
BOOL match;
|
|
BOOL m;
|
|
size_t index;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
node = bt_data->repeat.position.node;
|
|
|
|
rp_data = &state->repeats[bt_data->repeat.index];
|
|
|
|
/* Unmatch one character at a time until the tail could match or we
|
|
* have reached the minimum.
|
|
*/
|
|
state->text_pos = rp_data->start;
|
|
|
|
count = rp_data->count;
|
|
step = node->step;
|
|
pos = state->text_pos + (Py_ssize_t)count * step;
|
|
limit = state->text_pos + (Py_ssize_t)node->values[1] * step;
|
|
|
|
/* The tail failed to match at this position. */
|
|
if (!guard_repeat(safe_state, bt_data->repeat.index, pos,
|
|
RE_STATUS_TAIL, TRUE))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
if (count == node->values[1]) {
|
|
/* We've backtracked the repeat as far as we can. */
|
|
rp_data->start = bt_data->repeat.text_pos;
|
|
rp_data->count = bt_data->repeat.count;
|
|
discard_backtrack(state);
|
|
break;
|
|
}
|
|
|
|
test = node->next_1.test;
|
|
|
|
m = test->match;
|
|
index = node->values[0];
|
|
|
|
match = FALSE;
|
|
|
|
if (test->status & RE_STATUS_FUZZY) {
|
|
for (;;) {
|
|
RE_Position next_position;
|
|
|
|
pos -= step;
|
|
|
|
if (try_match(state, &node->next_1, pos, &next_position) &&
|
|
!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (pos == limit)
|
|
break;
|
|
}
|
|
} else {
|
|
/* A repeated single-character match is often followed by a
|
|
* literal, so checking specially for it can be a good
|
|
* optimisation when working with long strings.
|
|
*/
|
|
switch (test->op) {
|
|
case RE_OP_CHARACTER:
|
|
{
|
|
Py_UCS4 ch;
|
|
|
|
ch = test->values[0];
|
|
|
|
for (;;) {
|
|
--pos;
|
|
|
|
if (same_char(encoding, char_at(state->text, pos), ch)
|
|
== m && !is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (pos == limit)
|
|
break;
|
|
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_CHARACTER_IGN:
|
|
{
|
|
Py_UCS4 ch;
|
|
|
|
ch = test->values[0];
|
|
|
|
for (;;) {
|
|
--pos;
|
|
|
|
if (same_char_ign(encoding, char_at(state->text, pos),
|
|
ch) == m && !is_repeat_guarded(safe_state, index,
|
|
pos, RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (pos == limit)
|
|
break;
|
|
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
{
|
|
Py_UCS4 ch;
|
|
|
|
ch = test->values[0];
|
|
|
|
for (;;) {
|
|
++pos;
|
|
|
|
if (same_char_ign(encoding, char_at(state->text, pos -
|
|
1), ch) == m && !is_repeat_guarded(safe_state, index,
|
|
pos, RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (pos == limit)
|
|
break;
|
|
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_CHARACTER_REV:
|
|
{
|
|
Py_UCS4 ch;
|
|
|
|
ch = test->values[0];
|
|
|
|
for (;;) {
|
|
++pos;
|
|
|
|
if (same_char(encoding, char_at(state->text, pos - 1),
|
|
ch) == m && !is_repeat_guarded(safe_state, index,
|
|
pos, RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (pos == limit)
|
|
break;
|
|
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING:
|
|
{
|
|
Py_ssize_t length;
|
|
|
|
length = (Py_ssize_t)test->value_count;
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
pos = min_ssize_t(pos, state->slice_end - length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
BOOL is_partial;
|
|
|
|
if (pos < limit)
|
|
break;
|
|
|
|
found = string_search_rev(safe_state, test, pos +
|
|
length, limit, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
pos = found - length;
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
--pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD:
|
|
{
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
Py_ssize_t folded_length;
|
|
size_t i;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
folded_length = 0;
|
|
for (i = 0; i < test->value_count; i++)
|
|
folded_length += full_case_fold(test->values[i],
|
|
folded);
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
pos = min_ssize_t(pos, state->slice_end - folded_length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
Py_ssize_t new_pos;
|
|
BOOL is_partial;
|
|
|
|
if (pos < limit)
|
|
break;
|
|
|
|
found = string_search_fld_rev(safe_state, test, pos +
|
|
folded_length, limit, &new_pos, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
pos = found - folded_length;
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
--pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD_REV:
|
|
{
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
Py_ssize_t folded_length;
|
|
size_t i;
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
folded_length = 0;
|
|
for (i = 0; i < test->value_count; i++)
|
|
folded_length += full_case_fold(test->values[i],
|
|
folded);
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
pos = max_ssize_t(pos, state->slice_start + folded_length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
Py_ssize_t new_pos;
|
|
BOOL is_partial;
|
|
|
|
if (pos > limit)
|
|
break;
|
|
|
|
found = string_search_fld(safe_state, test, pos -
|
|
folded_length, limit, &new_pos, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
pos = found + folded_length;
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
++pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN:
|
|
{
|
|
Py_ssize_t length;
|
|
|
|
length = (Py_ssize_t)test->value_count;
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
pos = min_ssize_t(pos, state->slice_end - length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
BOOL is_partial;
|
|
|
|
if (pos < limit)
|
|
break;
|
|
|
|
found = string_search_ign_rev(safe_state, test, pos +
|
|
length, limit, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
pos = found - length;
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
--pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN_REV:
|
|
{
|
|
Py_ssize_t length;
|
|
|
|
length = (Py_ssize_t)test->value_count;
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
pos = max_ssize_t(pos, state->slice_start + length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
BOOL is_partial;
|
|
|
|
if (pos > limit)
|
|
break;
|
|
|
|
found = string_search_ign(safe_state, test, pos -
|
|
length, limit, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
pos = found + length;
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
++pos;
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_REV:
|
|
{
|
|
Py_ssize_t length;
|
|
|
|
length = (Py_ssize_t)test->value_count;
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
pos = max_ssize_t(pos, state->slice_start + length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
BOOL is_partial;
|
|
|
|
if (pos > limit)
|
|
break;
|
|
|
|
found = string_search(safe_state, test, pos - length,
|
|
limit, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
pos = found + length;
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
++pos;
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
for (;;) {
|
|
RE_Position next_position;
|
|
|
|
pos -= step;
|
|
|
|
status = try_match(state, &node->next_1, pos,
|
|
&next_position);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS &&
|
|
!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (pos == limit)
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (match) {
|
|
count = (size_t)abs_ssize_t(pos - state->text_pos);
|
|
|
|
/* The tail could match. */
|
|
if (count > node->values[1])
|
|
/* The match is longer than the minimum, so we might need
|
|
* to backtrack the repeat again to consume less.
|
|
*/
|
|
rp_data->count = count;
|
|
else {
|
|
/* We've reached or passed the minimum, so we won't need to
|
|
* backtrack the repeat again.
|
|
*/
|
|
rp_data->start = bt_data->repeat.text_pos;
|
|
rp_data->count = bt_data->repeat.count;
|
|
discard_backtrack(state);
|
|
|
|
/* Have we passed the minimum? */
|
|
if (count < node->values[1])
|
|
goto backtrack;
|
|
}
|
|
|
|
node = node->next_1.node;
|
|
state->text_pos = pos;
|
|
goto advance;
|
|
} else {
|
|
/* We've backtracked the repeat as far as we can. */
|
|
rp_data->start = bt_data->repeat.text_pos;
|
|
rp_data->count = bt_data->repeat.count;
|
|
discard_backtrack(state);
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_GROUP_CALL: /* Group call. */
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
pop_group_return(state);
|
|
discard_backtrack(state);
|
|
break;
|
|
case RE_OP_GROUP_RETURN: /* Group return. */
|
|
{
|
|
RE_Node* return_node;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
return_node = bt_data->group_call.node;
|
|
|
|
push_group_return(safe_state, return_node);
|
|
|
|
if (return_node) {
|
|
/* Restore the groups. */
|
|
pop_groups(state);
|
|
state->capture_change = bt_data->group_call.capture_change;
|
|
|
|
/* Restore the repeats. */
|
|
pop_repeats(state);
|
|
}
|
|
|
|
discard_backtrack(state);
|
|
break;
|
|
}
|
|
case RE_OP_LAZY_REPEAT_ONE: /* Lazy repeat for one character. */
|
|
{
|
|
RE_RepeatData* rp_data;
|
|
size_t count;
|
|
Py_ssize_t step;
|
|
Py_ssize_t pos;
|
|
Py_ssize_t available;
|
|
size_t max_count;
|
|
Py_ssize_t limit;
|
|
RE_Node* repeated;
|
|
RE_Node* test;
|
|
BOOL match;
|
|
BOOL m;
|
|
size_t index;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
node = bt_data->repeat.position.node;
|
|
|
|
rp_data = &state->repeats[bt_data->repeat.index];
|
|
|
|
/* Match one character at a time until the tail could match or we
|
|
* have reached the maximum.
|
|
*/
|
|
state->text_pos = rp_data->start;
|
|
count = rp_data->count;
|
|
|
|
step = node->step;
|
|
pos = state->text_pos + (Py_ssize_t)count * step;
|
|
available = step > 0 ? state->slice_end - state->text_pos :
|
|
state->text_pos - state->slice_start;
|
|
max_count = min_size_t((size_t)available, node->values[2]);
|
|
limit = state->text_pos + (Py_ssize_t)max_count * step;
|
|
|
|
repeated = node->nonstring.next_2.node;
|
|
|
|
test = node->next_1.test;
|
|
|
|
m = test->match;
|
|
index = node->values[0];
|
|
|
|
match = FALSE;
|
|
|
|
if (test->status & RE_STATUS_FUZZY) {
|
|
for (;;) {
|
|
RE_Position next_position;
|
|
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
pos += step;
|
|
|
|
status = try_match(state, &node->next_1, pos,
|
|
&next_position);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_SUCCESS &&
|
|
!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (pos == limit)
|
|
break;
|
|
}
|
|
} else {
|
|
/* A repeated single-character match is often followed by a
|
|
* literal, so checking specially for it can be a good
|
|
* optimisation when working with long strings.
|
|
*/
|
|
switch (test->op) {
|
|
case RE_OP_CHARACTER:
|
|
{
|
|
Py_UCS4 ch;
|
|
|
|
ch = test->values[0];
|
|
|
|
/* The tail is a character. We don't want to go off the end
|
|
* of the slice.
|
|
*/
|
|
limit = min_ssize_t(limit, state->slice_end - 1);
|
|
|
|
for (;;) {
|
|
if (pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos >= limit)
|
|
break;
|
|
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
++pos;
|
|
|
|
if (same_char(encoding, char_at(state->text, pos), ch)
|
|
== m && !is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_CHARACTER_IGN:
|
|
{
|
|
Py_UCS4 ch;
|
|
|
|
ch = test->values[0];
|
|
|
|
/* The tail is a character. We don't want to go off the end
|
|
* of the slice.
|
|
*/
|
|
limit = min_ssize_t(limit, state->slice_end - 1);
|
|
|
|
for (;;) {
|
|
if (pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos >= limit)
|
|
break;
|
|
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
++pos;
|
|
|
|
if (same_char_ign(encoding, char_at(state->text, pos),
|
|
ch) == m && !is_repeat_guarded(safe_state, index,
|
|
pos, RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
{
|
|
Py_UCS4 ch;
|
|
|
|
ch = test->values[0];
|
|
|
|
/* The tail is a character. We don't want to go off the end
|
|
* of the slice.
|
|
*/
|
|
limit = max_ssize_t(limit, state->slice_start + 1);
|
|
|
|
for (;;) {
|
|
if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos <= limit)
|
|
break;
|
|
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
--pos;
|
|
|
|
if (same_char_ign(encoding, char_at(state->text, pos -
|
|
1), ch) == m && !is_repeat_guarded(safe_state, index,
|
|
pos, RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_CHARACTER_REV:
|
|
{
|
|
Py_UCS4 ch;
|
|
|
|
ch = test->values[0];
|
|
|
|
/* The tail is a character. We don't want to go off the end
|
|
* of the slice.
|
|
*/
|
|
limit = max_ssize_t(limit, state->slice_start + 1);
|
|
|
|
for (;;) {
|
|
if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos <= limit)
|
|
break;
|
|
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
--pos;
|
|
|
|
if (same_char(encoding, char_at(state->text, pos - 1),
|
|
ch) == m && !is_repeat_guarded(safe_state, index,
|
|
pos, RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING:
|
|
{
|
|
Py_ssize_t length;
|
|
|
|
length = (Py_ssize_t)test->value_count;
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
limit = min_ssize_t(limit, state->slice_end - length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
BOOL is_partial;
|
|
|
|
if (pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos >= limit)
|
|
break;
|
|
|
|
/* Look for the tail string. */
|
|
found = string_search(safe_state, test, pos + 1, limit
|
|
+ length, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
if (repeated->op == RE_OP_ANY_ALL)
|
|
/* Anything can precede the tail. */
|
|
pos = found;
|
|
else {
|
|
/* Check that what precedes the tail will match. */
|
|
while (pos != found) {
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
++pos;
|
|
}
|
|
|
|
if (pos != found)
|
|
/* Something preceding the tail didn't match.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD:
|
|
{
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
limit = min_ssize_t(limit, state->slice_end);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
Py_ssize_t new_pos;
|
|
BOOL is_partial;
|
|
|
|
if (pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos >= limit)
|
|
break;
|
|
|
|
/* Look for the tail string. */
|
|
found = string_search_fld(safe_state, test, pos + 1,
|
|
limit, &new_pos, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
if (repeated->op == RE_OP_ANY_ALL)
|
|
/* Anything can precede the tail. */
|
|
pos = found;
|
|
else {
|
|
/* Check that what precedes the tail will match. */
|
|
while (pos != found) {
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
++pos;
|
|
}
|
|
|
|
if (pos != found)
|
|
/* Something preceding the tail didn't match.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD_REV:
|
|
{
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
limit = max_ssize_t(limit, state->slice_start);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
Py_ssize_t new_pos;
|
|
BOOL is_partial;
|
|
|
|
if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos <= limit)
|
|
break;
|
|
|
|
/* Look for the tail string. */
|
|
found = string_search_fld_rev(safe_state, test, pos -
|
|
1, limit, &new_pos, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
if (repeated->op == RE_OP_ANY_ALL)
|
|
/* Anything can precede the tail. */
|
|
pos = found;
|
|
else {
|
|
/* Check that what precedes the tail will match. */
|
|
while (pos != found) {
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
--pos;
|
|
}
|
|
|
|
if (pos != found)
|
|
/* Something preceding the tail didn't match.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN:
|
|
{
|
|
Py_ssize_t length;
|
|
|
|
length = (Py_ssize_t)test->value_count;
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
limit = min_ssize_t(limit, state->slice_end - length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
BOOL is_partial;
|
|
|
|
if (pos >= state->text_length && state->partial_side ==
|
|
RE_PARTIAL_RIGHT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos >= limit)
|
|
break;
|
|
|
|
/* Look for the tail string. */
|
|
found = string_search_ign(safe_state, test, pos + 1,
|
|
limit + length, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
if (repeated->op == RE_OP_ANY_ALL)
|
|
/* Anything can precede the tail. */
|
|
pos = found;
|
|
else {
|
|
/* Check that what precedes the tail will match. */
|
|
while (pos != found) {
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
++pos;
|
|
}
|
|
|
|
if (pos != found)
|
|
/* Something preceding the tail didn't match.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_IGN_REV:
|
|
{
|
|
Py_ssize_t length;
|
|
|
|
length = (Py_ssize_t)test->value_count;
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
limit = max_ssize_t(limit, state->slice_start + length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
BOOL is_partial;
|
|
|
|
if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos <= limit)
|
|
break;
|
|
|
|
/* Look for the tail string. */
|
|
found = string_search_ign_rev(safe_state, test, pos -
|
|
1, limit - length, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
if (repeated->op == RE_OP_ANY_ALL)
|
|
/* Anything can precede the tail. */
|
|
pos = found;
|
|
else {
|
|
/* Check that what precedes the tail will match. */
|
|
while (pos != found) {
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
--pos;
|
|
}
|
|
|
|
if (pos != found)
|
|
/* Something preceding the tail didn't match.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_STRING_REV:
|
|
{
|
|
Py_ssize_t length;
|
|
|
|
length = (Py_ssize_t)test->value_count;
|
|
|
|
/* The tail is a string. We don't want to go off the end of
|
|
* the slice.
|
|
*/
|
|
limit = max_ssize_t(limit, state->slice_start + length);
|
|
|
|
for (;;) {
|
|
Py_ssize_t found;
|
|
BOOL is_partial;
|
|
|
|
if (pos <= 0 && state->partial_side == RE_PARTIAL_LEFT)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (pos <= limit)
|
|
break;
|
|
|
|
/* Look for the tail string. */
|
|
found = string_search_rev(safe_state, test, pos - 1,
|
|
limit - length, &is_partial);
|
|
if (is_partial)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (found < 0)
|
|
break;
|
|
|
|
if (repeated->op == RE_OP_ANY_ALL)
|
|
/* Anything can precede the tail. */
|
|
pos = found;
|
|
else {
|
|
/* Check that what precedes the tail will match. */
|
|
while (pos != found) {
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
--pos;
|
|
}
|
|
|
|
if (pos != found)
|
|
/* Something preceding the tail didn't match.
|
|
*/
|
|
break;
|
|
}
|
|
|
|
if (!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
for (;;) {
|
|
RE_Position next_position;
|
|
|
|
status = match_one(state, repeated, pos);
|
|
if (status < 0)
|
|
return status;
|
|
|
|
if (status == RE_ERROR_FAILURE)
|
|
break;
|
|
|
|
pos += step;
|
|
|
|
status = try_match(state, &node->next_1, pos,
|
|
&next_position);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (status == RE_ERROR_SUCCESS &&
|
|
!is_repeat_guarded(safe_state, index, pos,
|
|
RE_STATUS_TAIL)) {
|
|
match = TRUE;
|
|
break;
|
|
}
|
|
|
|
if (pos == limit)
|
|
break;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (match) {
|
|
/* The tail could match. */
|
|
count = (size_t)abs_ssize_t(pos - state->text_pos);
|
|
state->text_pos = pos;
|
|
|
|
if (count < max_count) {
|
|
/* The match is shorter than the maximum, so we might need
|
|
* to backtrack the repeat again to consume more.
|
|
*/
|
|
rp_data->count = count;
|
|
} else {
|
|
/* We've reached or passed the maximum, so we won't need to
|
|
* backtrack the repeat again.
|
|
*/
|
|
rp_data->start = bt_data->repeat.text_pos;
|
|
rp_data->count = bt_data->repeat.count;
|
|
discard_backtrack(state);
|
|
|
|
/* Have we passed the maximum? */
|
|
if (count > max_count)
|
|
goto backtrack;
|
|
}
|
|
|
|
node = node->next_1.node;
|
|
goto advance;
|
|
} else {
|
|
/* The tail couldn't match. */
|
|
rp_data->start = bt_data->repeat.text_pos;
|
|
rp_data->count = bt_data->repeat.count;
|
|
discard_backtrack(state);
|
|
}
|
|
break;
|
|
}
|
|
case RE_OP_LOOKAROUND: /* Lookaround. */
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
/* Restore the groups and certain flags and then backtrack. */
|
|
pop_groups(state);
|
|
state->too_few_errors = bt_data->lookaround.too_few_errors;
|
|
state->capture_change = bt_data->lookaround.capture_change;
|
|
discard_backtrack(state);
|
|
break;
|
|
case RE_OP_MATCH_BODY:
|
|
{
|
|
RE_RepeatData* rp_data;
|
|
TRACE(("%s %d\n", re_op_text[bt_data->op], bt_data->repeat.index))
|
|
|
|
/* We want to match the body. */
|
|
rp_data = &state->repeats[bt_data->repeat.index];
|
|
|
|
/* Restore the repeat info. */
|
|
rp_data->count = bt_data->repeat.count;
|
|
rp_data->start = bt_data->repeat.start;
|
|
rp_data->capture_change = bt_data->repeat.capture_change;
|
|
|
|
/* Record backtracking info in case the body fails to match. */
|
|
bt_data->op = RE_OP_BODY_START;
|
|
|
|
/* Advance into the body. */
|
|
node = bt_data->repeat.position.node;
|
|
state->text_pos = bt_data->repeat.position.text_pos;
|
|
goto advance;
|
|
}
|
|
case RE_OP_MATCH_TAIL:
|
|
{
|
|
RE_RepeatData* rp_data;
|
|
TRACE(("%s %d\n", re_op_text[bt_data->op], bt_data->repeat.index))
|
|
|
|
/* We want to match the tail. */
|
|
rp_data = &state->repeats[bt_data->repeat.index];
|
|
|
|
/* Restore the repeat info. */
|
|
rp_data->count = bt_data->repeat.count;
|
|
rp_data->start = bt_data->repeat.start;
|
|
rp_data->capture_change = bt_data->repeat.capture_change;
|
|
|
|
/* Advance into the tail. */
|
|
node = bt_data->repeat.position.node;
|
|
state->text_pos = bt_data->repeat.position.text_pos;
|
|
|
|
discard_backtrack(state);
|
|
goto advance;
|
|
}
|
|
case RE_OP_REF_GROUP: /* Reference to a capture group. */
|
|
case RE_OP_REF_GROUP_IGN: /* Reference to a capture group, ignoring case. */
|
|
case RE_OP_REF_GROUP_IGN_REV: /* Reference to a capture group, backwards, ignoring case. */
|
|
case RE_OP_REF_GROUP_REV: /* Reference to a capture group, backwards. */
|
|
case RE_OP_STRING: /* A string. */
|
|
case RE_OP_STRING_IGN: /* A string, ignoring case. */
|
|
case RE_OP_STRING_IGN_REV: /* A string, backwards, ignoring case. */
|
|
case RE_OP_STRING_REV: /* A string, backwards. */
|
|
{
|
|
BOOL matched;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
status = retry_fuzzy_match_string(safe_state, search,
|
|
&state->text_pos, &node, &string_pos, &matched);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
|
|
if (matched)
|
|
goto advance;
|
|
|
|
string_pos = -1;
|
|
break;
|
|
}
|
|
case RE_OP_REF_GROUP_FLD: /* Reference to a capture group, ignoring case. */
|
|
case RE_OP_REF_GROUP_FLD_REV: /* Reference to a capture group, backwards, ignoring case. */
|
|
{
|
|
BOOL matched;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
status = retry_fuzzy_match_group_fld(safe_state, search,
|
|
&state->text_pos, &node, &folded_pos, &string_pos, &gfolded_pos,
|
|
&matched);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
|
|
if (matched)
|
|
goto advance;
|
|
|
|
string_pos = -1;
|
|
break;
|
|
}
|
|
case RE_OP_START_GROUP: /* Start of a capture group. */
|
|
{
|
|
RE_CODE private_index;
|
|
RE_GroupData* group;
|
|
TRACE(("%s %d\n", re_op_text[bt_data->op],
|
|
bt_data->group.public_index))
|
|
|
|
private_index = bt_data->group.private_index;
|
|
group = &state->groups[private_index - 1];
|
|
|
|
/* Unsave the capture? */
|
|
if (bt_data->group.capture)
|
|
unsave_capture(state, bt_data->group.private_index,
|
|
bt_data->group.public_index);
|
|
|
|
if (pattern->group_info[private_index - 1].referenced &&
|
|
group->span.start != bt_data->group.text_pos)
|
|
--state->capture_change;
|
|
group->span.start = bt_data->group.text_pos;
|
|
group->current_capture = bt_data->group.current_capture;
|
|
|
|
discard_backtrack(state);
|
|
break;
|
|
}
|
|
case RE_OP_STRING_FLD: /* A string, ignoring case. */
|
|
case RE_OP_STRING_FLD_REV: /* A string, backwards, ignoring case. */
|
|
{
|
|
BOOL matched;
|
|
TRACE(("%s\n", re_op_text[bt_data->op]))
|
|
|
|
status = retry_fuzzy_match_string_fld(safe_state, search,
|
|
&state->text_pos, &node, &string_pos, &folded_pos, &matched);
|
|
if (status < 0)
|
|
return RE_ERROR_PARTIAL;
|
|
|
|
if (matched)
|
|
goto advance;
|
|
|
|
string_pos = -1;
|
|
break;
|
|
}
|
|
default:
|
|
TRACE(("UNKNOWN OP %d\n", bt_data->op))
|
|
return RE_ERROR_ILLEGAL;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Saves group data for fuzzy matching. */
|
|
Py_LOCAL_INLINE(RE_GroupData*) save_groups(RE_SafeState* safe_state,
|
|
RE_GroupData* saved_groups) {
|
|
RE_State* state;
|
|
PatternObject* pattern;
|
|
size_t g;
|
|
|
|
/* Re-acquire the GIL. */
|
|
acquire_GIL(safe_state);
|
|
|
|
state = safe_state->re_state;
|
|
pattern = state->pattern;
|
|
|
|
if (!saved_groups) {
|
|
saved_groups = (RE_GroupData*)re_alloc(pattern->true_group_count *
|
|
sizeof(RE_GroupData));
|
|
if (!saved_groups)
|
|
goto error;
|
|
|
|
memset(saved_groups, 0, pattern->true_group_count *
|
|
sizeof(RE_GroupData));
|
|
}
|
|
|
|
for (g = 0; g < pattern->true_group_count; g++) {
|
|
RE_GroupData* orig;
|
|
RE_GroupData* copy;
|
|
|
|
orig = &state->groups[g];
|
|
copy = &saved_groups[g];
|
|
|
|
copy->span = orig->span;
|
|
|
|
if (orig->capture_count > copy->capture_capacity) {
|
|
RE_GroupSpan* cap_copy;
|
|
|
|
cap_copy = (RE_GroupSpan*)re_realloc(copy->captures,
|
|
orig->capture_count * sizeof(RE_GroupSpan));
|
|
if (!cap_copy)
|
|
goto error;
|
|
|
|
copy->capture_capacity = orig->capture_count;
|
|
copy->captures = cap_copy;
|
|
}
|
|
|
|
copy->capture_count = orig->capture_count;
|
|
Py_MEMCPY(copy->captures, orig->captures, orig->capture_count *
|
|
sizeof(RE_GroupSpan));
|
|
}
|
|
|
|
/* Release the GIL. */
|
|
release_GIL(safe_state);
|
|
|
|
return saved_groups;
|
|
|
|
error:
|
|
if (saved_groups) {
|
|
for (g = 0; g < pattern->true_group_count; g++)
|
|
re_dealloc(saved_groups[g].captures);
|
|
|
|
re_dealloc(saved_groups);
|
|
}
|
|
|
|
/* Release the GIL. */
|
|
release_GIL(safe_state);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* Restores group data for fuzzy matching. */
|
|
Py_LOCAL_INLINE(void) restore_groups(RE_SafeState* safe_state, RE_GroupData*
|
|
saved_groups) {
|
|
RE_State* state;
|
|
PatternObject* pattern;
|
|
size_t g;
|
|
|
|
/* Re-acquire the GIL. */
|
|
acquire_GIL(safe_state);
|
|
|
|
state = safe_state->re_state;
|
|
pattern = state->pattern;
|
|
|
|
for (g = 0; g < pattern->true_group_count; g++)
|
|
re_dealloc(state->groups[g].captures);
|
|
|
|
Py_MEMCPY(state->groups, saved_groups, pattern->true_group_count *
|
|
sizeof(RE_GroupData));
|
|
|
|
re_dealloc(saved_groups);
|
|
|
|
/* Release the GIL. */
|
|
release_GIL(safe_state);
|
|
}
|
|
|
|
/* Discards group data for fuzzy matching. */
|
|
Py_LOCAL_INLINE(void) discard_groups(RE_SafeState* safe_state, RE_GroupData*
|
|
saved_groups) {
|
|
RE_State* state;
|
|
PatternObject* pattern;
|
|
size_t g;
|
|
|
|
/* Re-acquire the GIL. */
|
|
acquire_GIL(safe_state);
|
|
|
|
state = safe_state->re_state;
|
|
pattern = state->pattern;
|
|
|
|
for (g = 0; g < pattern->true_group_count; g++)
|
|
re_dealloc(saved_groups[g].captures);
|
|
|
|
re_dealloc(saved_groups);
|
|
|
|
/* Release the GIL. */
|
|
release_GIL(safe_state);
|
|
}
|
|
|
|
/* Saves the fuzzy info. */
|
|
Py_LOCAL_INLINE(void) save_fuzzy_counts(RE_State* state, size_t* fuzzy_counts)
|
|
{
|
|
Py_MEMCPY(fuzzy_counts, state->total_fuzzy_counts,
|
|
sizeof(state->total_fuzzy_counts));
|
|
}
|
|
|
|
/* Restores the fuzzy info. */
|
|
Py_LOCAL_INLINE(void) restore_fuzzy_counts(RE_State* state, size_t*
|
|
fuzzy_counts) {
|
|
Py_MEMCPY(state->total_fuzzy_counts, fuzzy_counts,
|
|
sizeof(state->total_fuzzy_counts));
|
|
}
|
|
|
|
/* Performs a match or search from the current text position.
|
|
*
|
|
* The state can sometimes be shared across threads. In such instances there's
|
|
* a lock (mutex) on it. The lock is held for the duration of matching.
|
|
*/
|
|
Py_LOCAL_INLINE(int) do_match(RE_SafeState* safe_state, BOOL search) {
|
|
RE_State* state;
|
|
PatternObject* pattern;
|
|
Py_ssize_t available;
|
|
BOOL get_best;
|
|
BOOL enhance_match;
|
|
BOOL must_advance;
|
|
RE_GroupData* best_groups;
|
|
Py_ssize_t best_match_pos;
|
|
Py_ssize_t best_text_pos = 0; /* Initialise to stop compiler warning. */
|
|
int status;
|
|
Py_ssize_t slice_start;
|
|
Py_ssize_t slice_end;
|
|
size_t best_fuzzy_counts[RE_FUZZY_COUNT];
|
|
TRACE(("<<do_match>>\n"))
|
|
|
|
state = safe_state->re_state;
|
|
pattern = state->pattern;
|
|
|
|
/* Release the GIL. */
|
|
release_GIL(safe_state);
|
|
|
|
/* Is there enough to search? */
|
|
if (state->reverse) {
|
|
if (state->text_pos < state->slice_start) {
|
|
acquire_GIL(safe_state);
|
|
return FALSE;
|
|
}
|
|
|
|
available = state->text_pos - state->slice_start;
|
|
} else {
|
|
if (state->text_pos > state->slice_end) {
|
|
acquire_GIL(safe_state);
|
|
return FALSE;
|
|
}
|
|
|
|
available = state->slice_end - state->text_pos;
|
|
}
|
|
|
|
get_best = (pattern->flags & RE_FLAG_BESTMATCH) != 0;
|
|
enhance_match = (pattern->flags & RE_FLAG_ENHANCEMATCH) != 0 && !get_best;
|
|
|
|
/* The maximum permitted cost. */
|
|
state->max_cost = pattern->is_fuzzy ? PY_SSIZE_T_MAX : 0;
|
|
|
|
best_groups = NULL;
|
|
|
|
best_match_pos = state->text_pos;
|
|
must_advance = state->must_advance;
|
|
|
|
slice_start = state->slice_start;
|
|
slice_end = state->slice_end;
|
|
|
|
for (;;) {
|
|
/* If there's a better match, it won't start earlier in the string than
|
|
* the current best match, so there's no need to start earlier than
|
|
* that match.
|
|
*/
|
|
state->text_pos = best_match_pos;
|
|
state->must_advance = must_advance;
|
|
|
|
/* Initialise the state. */
|
|
init_match(state);
|
|
|
|
status = RE_ERROR_SUCCESS;
|
|
if (state->max_cost == 0 && state->partial_side == RE_PARTIAL_NONE) {
|
|
/* An exact match, and partial matches not permitted. */
|
|
if (available < state->min_width || (available == 0 &&
|
|
state->must_advance))
|
|
status = RE_ERROR_FAILURE;
|
|
}
|
|
|
|
if (status == RE_ERROR_SUCCESS)
|
|
status = basic_match(safe_state, pattern->start_node, search,
|
|
FALSE);
|
|
|
|
/* Has an error occurred, or is it a partial match? */
|
|
if (status < 0)
|
|
break;
|
|
|
|
if (status == RE_ERROR_FAILURE || (status == RE_ERROR_SUCCESS &&
|
|
state->total_cost == 0))
|
|
break;
|
|
|
|
if (!get_best && !enhance_match)
|
|
break;
|
|
|
|
save_fuzzy_counts(state, best_fuzzy_counts);
|
|
|
|
if (!get_best && state->text_pos == state->match_pos)
|
|
/* We want the first match. The match is already zero-width, so the
|
|
* cost can't get any lower (because the fit can't get any better).
|
|
*/
|
|
break;
|
|
|
|
if (best_groups) {
|
|
BOOL same;
|
|
size_t g;
|
|
|
|
/* Did we get the same match as the best so far? */
|
|
same = state->match_pos == best_match_pos && state->text_pos ==
|
|
best_text_pos;
|
|
for (g = 0; same && g < pattern->public_group_count; g++) {
|
|
same = state->groups[g].span.start == best_groups[g].span.start
|
|
&& state->groups[g].span.end == best_groups[g].span.end;
|
|
}
|
|
|
|
if (same)
|
|
break;
|
|
}
|
|
|
|
/* Save the best result so far. */
|
|
best_groups = save_groups(safe_state, best_groups);
|
|
if (!best_groups) {
|
|
status = RE_ERROR_MEMORY;
|
|
break;
|
|
}
|
|
|
|
best_match_pos = state->match_pos;
|
|
best_text_pos = state->text_pos;
|
|
|
|
if (state->max_cost == 0)
|
|
break;
|
|
|
|
/* Reduce the maximum permitted cost and try again. */
|
|
state->max_cost = state->total_cost - 1;
|
|
|
|
if (enhance_match) {
|
|
if (state->reverse) {
|
|
state->slice_start = state->text_pos;
|
|
state->slice_end = state->match_pos;
|
|
} else {
|
|
state->slice_start = state->match_pos;
|
|
state->slice_end = state->text_pos;
|
|
}
|
|
}
|
|
}
|
|
|
|
state->slice_start = slice_start;
|
|
state->slice_end = slice_end;
|
|
|
|
if (best_groups) {
|
|
if (status == RE_ERROR_SUCCESS && state->total_cost == 0)
|
|
/* We have a perfect match, so the previous best match. */
|
|
discard_groups(safe_state, best_groups);
|
|
else {
|
|
/* Restore the previous best match. */
|
|
status = RE_ERROR_SUCCESS;
|
|
|
|
state->match_pos = best_match_pos;
|
|
state->text_pos = best_text_pos;
|
|
|
|
restore_groups(safe_state, best_groups);
|
|
restore_fuzzy_counts(state, best_fuzzy_counts);
|
|
}
|
|
}
|
|
|
|
if (status == RE_ERROR_SUCCESS || status == RE_ERROR_PARTIAL) {
|
|
Py_ssize_t max_end_index;
|
|
RE_GroupInfo* group_info;
|
|
size_t g;
|
|
|
|
/* Store the results. */
|
|
state->lastindex = -1;
|
|
state->lastgroup = -1;
|
|
max_end_index = -1;
|
|
|
|
/* Store the capture groups. */
|
|
group_info = pattern->group_info;
|
|
|
|
for (g = 0; g < pattern->public_group_count; g++) {
|
|
RE_GroupSpan* span;
|
|
|
|
span = &state->groups[g].span;
|
|
/* The string positions are of type Py_ssize_t, so the format needs
|
|
* to specify that.
|
|
*/
|
|
TRACE(("group %d from %" PY_FORMAT_SIZE_T "d to %" PY_FORMAT_SIZE_T
|
|
"d\n", g + 1, span->start, span->end))
|
|
|
|
if (span->start >= 0 && span->end >= 0 && group_info[g].end_index >
|
|
max_end_index) {
|
|
max_end_index = group_info[g].end_index;
|
|
state->lastindex = (Py_ssize_t)g + 1;
|
|
if (group_info[g].has_name)
|
|
state->lastgroup = (Py_ssize_t)g + 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Re-acquire the GIL. */
|
|
acquire_GIL(safe_state);
|
|
|
|
if (status < 0 && status != RE_ERROR_PARTIAL && !PyErr_Occurred())
|
|
set_error(status, NULL);
|
|
|
|
return status;
|
|
}
|
|
|
|
/* Gets a string from a Python object.
|
|
*
|
|
* If the function returns true and str_info->should_release is true then it's
|
|
* the responsibility of the caller to release the buffer when it's no longer
|
|
* needed.
|
|
*/
|
|
Py_LOCAL_INLINE(BOOL) get_string(PyObject* string, RE_StringInfo* str_info) {
|
|
/* Given a Python object, return a data pointer, a length (in characters),
|
|
* and a character size. Return FALSE if the object is not a string (or not
|
|
* compatible).
|
|
*/
|
|
PyBufferProcs* buffer;
|
|
Py_ssize_t bytes;
|
|
Py_ssize_t size;
|
|
|
|
/* Unicode objects do not support the buffer API. So, get the data directly
|
|
* instead.
|
|
*/
|
|
if (PyUnicode_Check(string)) {
|
|
/* Unicode strings doesn't always support the buffer interface. */
|
|
str_info->characters = (void*)PyUnicode_AS_DATA(string);
|
|
str_info->length = PyUnicode_GET_SIZE(string);
|
|
str_info->charsize = sizeof(Py_UNICODE);
|
|
str_info->is_unicode = TRUE;
|
|
str_info->should_release = FALSE;
|
|
return TRUE;
|
|
}
|
|
|
|
/* Get pointer to string buffer. */
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
buffer = Py_TYPE(string)->tp_as_buffer;
|
|
str_info->view.len = -1;
|
|
#else
|
|
buffer = string->ob_type->tp_as_buffer;
|
|
#endif
|
|
|
|
if (!buffer) {
|
|
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
|
|
return FALSE;
|
|
}
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
if (buffer->bf_getbuffer && (*buffer->bf_getbuffer)(string,
|
|
&str_info->view, PyBUF_SIMPLE) >= 0)
|
|
/* It's a new-style buffer. */
|
|
str_info->should_release = TRUE;
|
|
else
|
|
#endif
|
|
if (buffer->bf_getreadbuffer && buffer->bf_getsegcount &&
|
|
buffer->bf_getsegcount(string, NULL) == 1)
|
|
/* It's an old-style buffer. */
|
|
str_info->should_release = FALSE;
|
|
else {
|
|
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
|
|
return FALSE;
|
|
}
|
|
|
|
/* Determine buffer size. */
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
if (str_info->should_release) {
|
|
/* It's a new-style buffer. */
|
|
bytes = str_info->view.len;
|
|
str_info->characters = str_info->view.buf;
|
|
|
|
if (str_info->characters == NULL) {
|
|
PyBuffer_Release(&str_info->view);
|
|
PyErr_SetString(PyExc_ValueError, "buffer is NULL");
|
|
return FALSE;
|
|
}
|
|
} else
|
|
#endif
|
|
/* It's an old-style buffer. */
|
|
bytes = buffer->bf_getreadbuffer(string, 0, &str_info->characters);
|
|
|
|
if (bytes < 0) {
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
if (str_info->should_release)
|
|
PyBuffer_Release(&str_info->view);
|
|
#endif
|
|
PyErr_SetString(PyExc_TypeError, "buffer has negative size");
|
|
return FALSE;
|
|
}
|
|
|
|
/* Determine character size. */
|
|
size = PyObject_Size(string);
|
|
|
|
if (PyString_Check(string) || bytes == size)
|
|
str_info->charsize = 1;
|
|
else {
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
if (str_info->should_release)
|
|
PyBuffer_Release(&str_info->view);
|
|
#endif
|
|
PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
|
|
return FALSE;
|
|
}
|
|
|
|
str_info->length = size;
|
|
str_info->is_unicode = FALSE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Deallocates the groups storage. */
|
|
Py_LOCAL_INLINE(void) dealloc_groups(RE_GroupData* groups, size_t group_count)
|
|
{
|
|
size_t g;
|
|
|
|
if (!groups)
|
|
return;
|
|
|
|
for (g = 0; g < group_count; g++)
|
|
re_dealloc(groups[g].captures);
|
|
|
|
re_dealloc(groups);
|
|
}
|
|
|
|
/* Initialises a state object. */
|
|
Py_LOCAL_INLINE(BOOL) state_init_2(RE_State* state, PatternObject* pattern,
|
|
PyObject* string, RE_StringInfo* str_info, Py_ssize_t start, Py_ssize_t end,
|
|
BOOL overlapped, int concurrent, BOOL partial, BOOL use_lock, BOOL
|
|
visible_captures, BOOL match_all) {
|
|
Py_ssize_t final_pos;
|
|
int i;
|
|
|
|
state->groups = NULL;
|
|
state->repeats = NULL;
|
|
state->visible_captures = visible_captures;
|
|
state->match_all = match_all;
|
|
state->backtrack_block.previous = NULL;
|
|
state->backtrack_block.next = NULL;
|
|
state->backtrack_block.capacity = RE_BACKTRACK_BLOCK_SIZE;
|
|
state->backtrack_allocated = RE_BACKTRACK_BLOCK_SIZE;
|
|
state->first_saved_groups = NULL;
|
|
state->current_saved_groups = NULL;
|
|
state->first_saved_repeats = NULL;
|
|
state->current_saved_repeats = NULL;
|
|
state->lock = NULL;
|
|
state->fuzzy_guards = NULL;
|
|
state->first_group_call_frame = NULL;
|
|
state->current_group_call_frame = NULL;
|
|
state->group_call_guard_list = NULL;
|
|
state->req_pos = -1;
|
|
|
|
/* The call guards used by recursive patterns. */
|
|
if (pattern->call_ref_info_count > 0) {
|
|
state->group_call_guard_list =
|
|
(RE_GuardList*)re_alloc(pattern->call_ref_info_count *
|
|
sizeof(RE_GuardList));
|
|
if (!state->group_call_guard_list)
|
|
goto error;
|
|
memset(state->group_call_guard_list, 0, pattern->call_ref_info_count *
|
|
sizeof(RE_GuardList));
|
|
}
|
|
|
|
/* The capture groups. */
|
|
if (pattern->true_group_count) {
|
|
size_t g;
|
|
|
|
if (pattern->groups_storage) {
|
|
state->groups = pattern->groups_storage;
|
|
pattern->groups_storage = NULL;
|
|
} else {
|
|
state->groups = (RE_GroupData*)re_alloc(pattern->true_group_count *
|
|
sizeof(RE_GroupData));
|
|
if (!state->groups)
|
|
goto error;
|
|
memset(state->groups, 0, pattern->true_group_count *
|
|
sizeof(RE_GroupData));
|
|
|
|
for (g = 0; g < pattern->true_group_count; g++) {
|
|
RE_GroupSpan* captures;
|
|
|
|
captures = (RE_GroupSpan*)re_alloc(sizeof(RE_GroupSpan));
|
|
if (!captures) {
|
|
size_t i;
|
|
|
|
for (i = 0; i < g; i++)
|
|
re_dealloc(state->groups[i].captures);
|
|
|
|
goto error;
|
|
}
|
|
|
|
state->groups[g].captures = captures;
|
|
state->groups[g].capture_capacity = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Adjust boundaries. */
|
|
if (start < 0)
|
|
start += str_info->length;
|
|
if (start < 0)
|
|
start = 0;
|
|
else if (start > str_info->length)
|
|
start = str_info->length;
|
|
|
|
if (end < 0)
|
|
end += str_info->length;
|
|
if (end < 0)
|
|
end = 0;
|
|
else if (end > str_info->length)
|
|
end = str_info->length;
|
|
|
|
state->overlapped = overlapped;
|
|
state->min_width = pattern->min_width;
|
|
|
|
/* Initialise the getters and setters for the character size. */
|
|
state->charsize = str_info->charsize;
|
|
state->is_unicode = str_info->is_unicode;
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
/* Are we using a buffer object? If so, we need to copy the info. */
|
|
state->should_release = str_info->should_release;
|
|
if (state->should_release)
|
|
state->view = str_info->view;
|
|
|
|
#endif
|
|
switch (state->charsize) {
|
|
case 1:
|
|
state->char_at = bytes1_char_at;
|
|
state->set_char_at = bytes1_set_char_at;
|
|
state->point_to = bytes1_point_to;
|
|
break;
|
|
case 2:
|
|
state->char_at = bytes2_char_at;
|
|
state->set_char_at = bytes2_set_char_at;
|
|
state->point_to = bytes2_point_to;
|
|
break;
|
|
case 4:
|
|
state->char_at = bytes4_char_at;
|
|
state->set_char_at = bytes4_set_char_at;
|
|
state->point_to = bytes4_point_to;
|
|
break;
|
|
default:
|
|
goto error;
|
|
}
|
|
|
|
state->encoding = pattern->encoding;
|
|
|
|
/* The state object contains a reference to the string and also a pointer
|
|
* to its contents.
|
|
*
|
|
* The documentation says that the end of the slice behaves like the end of
|
|
* the string.
|
|
*/
|
|
state->text = str_info->characters;
|
|
state->text_length = end;
|
|
|
|
state->reverse = (pattern->flags & RE_FLAG_REVERSE) != 0;
|
|
if (partial)
|
|
state->partial_side = state->reverse ? RE_PARTIAL_LEFT :
|
|
RE_PARTIAL_RIGHT;
|
|
else
|
|
state->partial_side = RE_PARTIAL_NONE;
|
|
|
|
state->slice_start = start;
|
|
state->slice_end = state->text_length;
|
|
state->text_pos = state->reverse ? state->slice_end : state->slice_start;
|
|
|
|
/* Point to the final newline and line separator if it's at the end of the
|
|
* string, otherwise just -1.
|
|
*/
|
|
state->final_newline = -1;
|
|
state->final_line_sep = -1;
|
|
final_pos = state->text_length - 1;
|
|
if (final_pos >= 0) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = state->char_at(state->text, final_pos);
|
|
if (ch == 0x0A) {
|
|
/* The string ends with LF. */
|
|
state->final_newline = final_pos;
|
|
state->final_line_sep = final_pos;
|
|
|
|
/* Does the string end with CR/LF? */
|
|
--final_pos;
|
|
if (final_pos >= 0 && state->char_at(state->text, final_pos) ==
|
|
0x0D)
|
|
state->final_line_sep = final_pos;
|
|
} else {
|
|
/* The string doesn't end with LF, but it could be another kind of
|
|
* line separator.
|
|
*/
|
|
if (state->encoding->is_line_sep(ch))
|
|
state->final_line_sep = final_pos;
|
|
}
|
|
}
|
|
|
|
/* If the 'new' behaviour is enabled then split correctly on zero-width
|
|
* matches.
|
|
*/
|
|
state->version_0 = (pattern->flags & RE_FLAG_VERSION1) == 0;
|
|
state->must_advance = FALSE;
|
|
|
|
state->pattern = pattern;
|
|
state->string = string;
|
|
|
|
if (pattern->repeat_count) {
|
|
if (pattern->repeats_storage) {
|
|
state->repeats = pattern->repeats_storage;
|
|
pattern->repeats_storage = NULL;
|
|
} else {
|
|
state->repeats = (RE_RepeatData*)re_alloc(pattern->repeat_count *
|
|
sizeof(RE_RepeatData));
|
|
if (!state->repeats)
|
|
goto error;
|
|
memset(state->repeats, 0, pattern->repeat_count *
|
|
sizeof(RE_RepeatData));
|
|
}
|
|
}
|
|
|
|
if (pattern->fuzzy_count) {
|
|
state->fuzzy_guards = (RE_FuzzyGuards*)re_alloc(pattern->fuzzy_count *
|
|
sizeof(RE_FuzzyGuards));
|
|
if (!state->fuzzy_guards)
|
|
goto error;
|
|
memset(state->fuzzy_guards, 0, pattern->fuzzy_count *
|
|
sizeof(RE_FuzzyGuards));
|
|
}
|
|
|
|
Py_INCREF(state->pattern);
|
|
Py_INCREF(state->string);
|
|
|
|
/* Multithreading is allowed during matching when explicitly enabled or on
|
|
* immutable strings.
|
|
*/
|
|
switch (concurrent) {
|
|
case RE_CONC_NO:
|
|
state->is_multithreaded = FALSE;
|
|
break;
|
|
case RE_CONC_YES:
|
|
state->is_multithreaded = TRUE;
|
|
break;
|
|
default:
|
|
state->is_multithreaded = PyUnicode_Check(string) ||
|
|
PyString_Check(string);
|
|
break;
|
|
}
|
|
|
|
/* A state struct can sometimes be shared across threads. In such
|
|
* instances, if multithreading is enabled we need to protect the state
|
|
* with a lock (mutex) during matching.
|
|
*/
|
|
if (state->is_multithreaded && use_lock)
|
|
state->lock = PyThread_allocate_lock();
|
|
|
|
for (i = 0; i < MAX_SEARCH_POSITIONS; i++)
|
|
state->search_positions[i].start_pos = -1;
|
|
|
|
return TRUE;
|
|
|
|
error:
|
|
re_dealloc(state->group_call_guard_list);
|
|
re_dealloc(state->repeats);
|
|
dealloc_groups(state->groups, pattern->true_group_count);
|
|
re_dealloc(state->fuzzy_guards);
|
|
state->repeats = NULL;
|
|
state->groups = NULL;
|
|
state->fuzzy_guards = NULL;
|
|
return FALSE;
|
|
}
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
/* Releases the buffer view held in 'str_info', but only if one was acquired
 * (i.e. should_release is true).
 */
Py_LOCAL_INLINE(void) release_buffer(RE_StringInfo* str_info) {
    if (!str_info->should_release)
        return;

    PyBuffer_Release(&str_info->view);
}

#endif
|
|
/* Initialises a state object. */
|
|
Py_LOCAL_INLINE(BOOL) state_init(RE_State* state, PatternObject* pattern,
|
|
PyObject* string, Py_ssize_t start, Py_ssize_t end, BOOL overlapped, int
|
|
concurrent, BOOL partial, BOOL use_lock, BOOL visible_captures, BOOL
|
|
match_all) {
|
|
RE_StringInfo str_info;
|
|
|
|
/* Get the string to search or match. */
|
|
if (!get_string(string, &str_info))
|
|
return FALSE;
|
|
|
|
/* If we fail to initialise the state then we need to release the buffer if
|
|
* the string is a buffer object.
|
|
*/
|
|
if (!state_init_2(state, pattern, string, &str_info, start, end,
|
|
overlapped, concurrent, partial, use_lock, visible_captures, match_all))
|
|
{
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return FALSE;
|
|
}
|
|
|
|
/* The state has been initialised successfully, so now the state has the
|
|
* responsibility of releasing the buffer if the string is a buffer object.
|
|
*/
|
|
return TRUE;
|
|
}
|
|
|
|
/* Deallocates repeat data. */
|
|
Py_LOCAL_INLINE(void) dealloc_repeats(RE_RepeatData* repeats, size_t
|
|
repeat_count) {
|
|
size_t i;
|
|
|
|
if (!repeats)
|
|
return;
|
|
|
|
for (i = 0; i < repeat_count; i++) {
|
|
re_dealloc(repeats[i].body_guard_list.spans);
|
|
re_dealloc(repeats[i].tail_guard_list.spans);
|
|
}
|
|
|
|
re_dealloc(repeats);
|
|
}
|
|
|
|
/* Deallocates fuzzy guards. */
|
|
Py_LOCAL_INLINE(void) dealloc_fuzzy_guards(RE_FuzzyGuards* guards, size_t
|
|
fuzzy_count) {
|
|
size_t i;
|
|
|
|
if (!guards)
|
|
return;
|
|
|
|
for (i = 0; i < fuzzy_count; i++) {
|
|
re_dealloc(guards[i].body_guard_list.spans);
|
|
re_dealloc(guards[i].tail_guard_list.spans);
|
|
}
|
|
|
|
re_dealloc(guards);
|
|
}
|
|
|
|
/* Finalises a state object, discarding its contents. */
|
|
Py_LOCAL_INLINE(void) state_fini(RE_State* state) {
|
|
RE_BacktrackBlock* current;
|
|
PatternObject* pattern;
|
|
RE_SavedGroups* saved_groups;
|
|
RE_SavedRepeats* saved_repeats;
|
|
RE_GroupCallFrame* frame;
|
|
size_t i;
|
|
|
|
/* Discard the lock (mutex) if there's one. */
|
|
if (state->lock)
|
|
PyThread_free_lock(state->lock);
|
|
|
|
/* Deallocate the backtrack blocks. */
|
|
current = state->backtrack_block.next;
|
|
while (current) {
|
|
RE_BacktrackBlock* next;
|
|
|
|
next = current->next;
|
|
re_dealloc(current);
|
|
state->backtrack_allocated -= RE_BACKTRACK_BLOCK_SIZE;
|
|
current = next;
|
|
}
|
|
|
|
pattern = state->pattern;
|
|
|
|
saved_groups = state->first_saved_groups;
|
|
while (saved_groups) {
|
|
RE_SavedGroups* next;
|
|
|
|
next = saved_groups->next;
|
|
re_dealloc(saved_groups->spans);
|
|
re_dealloc(saved_groups->counts);
|
|
re_dealloc(saved_groups);
|
|
saved_groups = next;
|
|
}
|
|
|
|
saved_repeats = state->first_saved_repeats;
|
|
while (saved_repeats) {
|
|
RE_SavedRepeats* next;
|
|
|
|
next = saved_repeats->next;
|
|
|
|
dealloc_repeats(saved_repeats->repeats, pattern->repeat_count);
|
|
|
|
re_dealloc(saved_repeats);
|
|
saved_repeats = next;
|
|
}
|
|
|
|
if (pattern->groups_storage)
|
|
dealloc_groups(state->groups, pattern->true_group_count);
|
|
else
|
|
pattern->groups_storage = state->groups;
|
|
|
|
if (pattern->repeats_storage)
|
|
dealloc_repeats(state->repeats, pattern->repeat_count);
|
|
else
|
|
pattern->repeats_storage = state->repeats;
|
|
|
|
frame = state->first_group_call_frame;
|
|
while (frame) {
|
|
RE_GroupCallFrame* next;
|
|
|
|
next = frame->next;
|
|
|
|
dealloc_groups(frame->groups, pattern->true_group_count);
|
|
dealloc_repeats(frame->repeats, pattern->repeat_count);
|
|
|
|
re_dealloc(frame);
|
|
frame = next;
|
|
}
|
|
|
|
for (i = 0; i < pattern->call_ref_info_count; i++)
|
|
re_dealloc(state->group_call_guard_list[i].spans);
|
|
|
|
if (state->group_call_guard_list)
|
|
re_dealloc(state->group_call_guard_list);
|
|
|
|
if (state->fuzzy_guards)
|
|
dealloc_fuzzy_guards(state->fuzzy_guards, pattern->fuzzy_count);
|
|
|
|
Py_DECREF(state->pattern);
|
|
Py_DECREF(state->string);
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
|
|
if (state->should_release)
|
|
PyBuffer_Release(&state->view);
|
|
#endif
|
|
}
|
|
|
|
/* Converts a string index to an integer.
|
|
*
|
|
* If the index is None then the default will be returned.
|
|
*/
|
|
Py_LOCAL_INLINE(Py_ssize_t) as_string_index(PyObject* obj, Py_ssize_t def) {
|
|
Py_ssize_t value;
|
|
|
|
if (obj == Py_None)
|
|
return def;
|
|
|
|
value = PyInt_AsSsize_t(obj);
|
|
if (value != -1 || !PyErr_Occurred())
|
|
return value;
|
|
|
|
PyErr_Clear();
|
|
|
|
value = PyLong_AsLong(obj);
|
|
if (value != -1 || !PyErr_Occurred())
|
|
return value;
|
|
|
|
set_error(RE_ERROR_INDEX, NULL);
|
|
return 0;
|
|
}
|
|
|
|
/* Deallocates a MatchObject. */
|
|
static void match_dealloc(PyObject* self_) {
|
|
MatchObject* self;
|
|
|
|
self = (MatchObject*)self_;
|
|
|
|
Py_XDECREF(self->string);
|
|
Py_XDECREF(self->substring);
|
|
Py_DECREF(self->pattern);
|
|
if (self->groups)
|
|
re_dealloc(self->groups);
|
|
Py_XDECREF(self->regs);
|
|
PyObject_DEL(self);
|
|
}
|
|
|
|
/* Restricts a value to a range. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) limited_range(Py_ssize_t value, Py_ssize_t lower,
|
|
Py_ssize_t upper) {
|
|
if (value < lower)
|
|
return lower;
|
|
|
|
if (value > upper)
|
|
return upper;
|
|
|
|
return value;
|
|
}
|
|
|
|
/* Gets a slice from a Unicode string. */
|
|
Py_LOCAL_INLINE(PyObject*) unicode_slice(PyObject* string, Py_ssize_t start,
|
|
Py_ssize_t end) {
|
|
Py_ssize_t length;
|
|
Py_UNICODE* buffer;
|
|
|
|
length = PyUnicode_GET_SIZE(string);
|
|
start = limited_range(start, 0, length);
|
|
end = limited_range(end, 0, length);
|
|
|
|
buffer = PyUnicode_AsUnicode(string);
|
|
|
|
return PyUnicode_FromUnicode(buffer + start, end - start);
|
|
}
|
|
|
|
/* Gets a slice from a bytestring. */
|
|
Py_LOCAL_INLINE(PyObject*) bytes_slice(PyObject* string, Py_ssize_t start,
|
|
Py_ssize_t end) {
|
|
Py_ssize_t length;
|
|
char* buffer;
|
|
|
|
length = PyString_GET_SIZE(string);
|
|
start = limited_range(start, 0, length);
|
|
end = limited_range(end, 0, length);
|
|
|
|
buffer = PyString_AsString(string);
|
|
|
|
return PyString_FromStringAndSize(buffer + start, end - start);
|
|
}
|
|
|
|
/* Gets a slice from a string, returning either a Unicode string or a
|
|
* bytestring.
|
|
*/
|
|
Py_LOCAL_INLINE(PyObject*) get_slice(PyObject* string, Py_ssize_t start,
|
|
Py_ssize_t end) {
|
|
if (PyUnicode_Check(string))
|
|
return unicode_slice(string, start, end);
|
|
|
|
if (PyString_Check(string))
|
|
return bytes_slice(string, start, end);
|
|
|
|
return PySequence_GetSlice(string, start, end);
|
|
}
|
|
|
|
/* Gets a MatchObject's group by integer index. */
|
|
static PyObject* match_get_group_by_index(MatchObject* self, Py_ssize_t index,
|
|
PyObject* def) {
|
|
RE_GroupSpan* span;
|
|
|
|
if (index < 0 || (size_t)index > self->group_count) {
|
|
/* Raise error if we were given a bad group number. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0)
|
|
return get_slice(self->substring, self->match_start -
|
|
self->substring_offset, self->match_end - self->substring_offset);
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
span = &self->groups[index - 1].span;
|
|
|
|
if (span->start < 0 || span->end < 0) {
|
|
/* Return default value if the string or group is undefined. */
|
|
Py_INCREF(def);
|
|
return def;
|
|
}
|
|
|
|
return get_slice(self->substring, span->start - self->substring_offset,
|
|
span->end - self->substring_offset);
|
|
}
|
|
|
|
/* Gets a MatchObject's start by integer index. */
|
|
static PyObject* match_get_start_by_index(MatchObject* self, Py_ssize_t index)
|
|
{
|
|
RE_GroupSpan* span;
|
|
|
|
if (index < 0 || (size_t)index > self->group_count) {
|
|
/* Raise error if we were given a bad group number. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0)
|
|
return Py_BuildValue("n", self->match_start);
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
span = &self->groups[index - 1].span;
|
|
return Py_BuildValue("n", span->start);
|
|
}
|
|
|
|
/* Gets a MatchObject's starts by integer index. */
|
|
static PyObject* match_get_starts_by_index(MatchObject* self, Py_ssize_t index)
|
|
{
|
|
RE_GroupData* group;
|
|
PyObject* result;
|
|
PyObject* item;
|
|
size_t i;
|
|
|
|
if (index < 0 || (size_t)index > self->group_count) {
|
|
/* Raise error if we were given a bad group number. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0) {
|
|
result = PyList_New(1);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
item = Py_BuildValue("n", self->match_start);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(result, 0, item);
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
group = &self->groups[index - 1];
|
|
|
|
result = PyList_New((Py_ssize_t)group->capture_count);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
for (i = 0; i < group->capture_count; i++) {
|
|
item = Py_BuildValue("n", group->captures[i].start);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(result, i, item);
|
|
}
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* Gets a MatchObject's end by integer index. */
|
|
static PyObject* match_get_end_by_index(MatchObject* self, Py_ssize_t index) {
|
|
RE_GroupSpan* span;
|
|
|
|
if (index < 0 || (size_t)index > self->group_count) {
|
|
/* Raise error if we were given a bad group number. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0)
|
|
return Py_BuildValue("n", self->match_end);
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
span = &self->groups[index - 1].span;
|
|
return Py_BuildValue("n", span->end);
|
|
}
|
|
|
|
/* Gets a MatchObject's ends by integer index. */
|
|
static PyObject* match_get_ends_by_index(MatchObject* self, Py_ssize_t index) {
|
|
RE_GroupData* group;
|
|
PyObject* result;
|
|
PyObject* item;
|
|
size_t i;
|
|
|
|
if (index < 0 || (size_t)index > self->group_count) {
|
|
/* Raise error if we were given a bad group number. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0) {
|
|
result = PyList_New(1);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
item = Py_BuildValue("n", self->match_end);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(result, 0, item);
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
group = &self->groups[index - 1];
|
|
|
|
result = PyList_New((Py_ssize_t)group->capture_count);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
for (i = 0; i < group->capture_count; i++) {
|
|
item = Py_BuildValue("n", group->captures[i].end);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(result, i, item);
|
|
}
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* Gets a MatchObject's span by integer index. */
|
|
static PyObject* match_get_span_by_index(MatchObject* self, Py_ssize_t index) {
|
|
RE_GroupSpan* span;
|
|
|
|
if (index < 0 || (size_t)index > self->group_count) {
|
|
/* Raise error if we were given a bad group number. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0)
|
|
return Py_BuildValue("nn", self->match_start, self->match_end);
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
span = &self->groups[index - 1].span;
|
|
return Py_BuildValue("nn", span->start, span->end);
|
|
}
|
|
|
|
/* Gets a MatchObject's spans by integer index. */
|
|
static PyObject* match_get_spans_by_index(MatchObject* self, Py_ssize_t index)
|
|
{
|
|
RE_GroupData* group;
|
|
PyObject* result;
|
|
PyObject* item;
|
|
size_t i;
|
|
|
|
if (index < 0 || (size_t)index > self->group_count) {
|
|
/* Raise error if we were given a bad group number. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0) {
|
|
result = PyList_New(1);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
item = Py_BuildValue("nn", self->match_start, self->match_end);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(result, 0, item);
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
group = &self->groups[index - 1];
|
|
|
|
result = PyList_New((Py_ssize_t)group->capture_count);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
for (i = 0; i < group->capture_count; i++) {
|
|
item = Py_BuildValue("nn", group->captures[i].start,
|
|
group->captures[i].end);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(result, i, item);
|
|
}
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* Gets a MatchObject's captures by integer index. */
|
|
static PyObject* match_get_captures_by_index(MatchObject* self, Py_ssize_t
|
|
index) {
|
|
RE_GroupData* group;
|
|
PyObject* result;
|
|
PyObject* slice;
|
|
size_t i;
|
|
|
|
if (index < 0 || (size_t)index > self->group_count) {
|
|
/* Raise error if we were given a bad group number. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0) {
|
|
result = PyList_New(1);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
slice = get_slice(self->substring, self->match_start -
|
|
self->substring_offset, self->match_end - self->substring_offset);
|
|
if (!slice)
|
|
goto error;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(result, 0, slice);
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Capture group indexes are 1-based (excluding group 0, which is the
|
|
* entire matched string).
|
|
*/
|
|
group = &self->groups[index - 1];
|
|
|
|
result = PyList_New((Py_ssize_t)group->capture_count);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
for (i = 0; i < group->capture_count; i++) {
|
|
slice = get_slice(self->substring, group->captures[i].start -
|
|
self->substring_offset, group->captures[i].end -
|
|
self->substring_offset);
|
|
if (!slice)
|
|
goto error;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(result, i, slice);
|
|
}
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* Converts a group index to an integer. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) as_group_index(PyObject* obj) {
|
|
Py_ssize_t value;
|
|
|
|
value = PyInt_AsSsize_t(obj);
|
|
if (value != -1 || !PyErr_Occurred())
|
|
return value;
|
|
|
|
PyErr_Clear();
|
|
|
|
value = PyLong_AsLong(obj);
|
|
if (value != -1 || !PyErr_Occurred())
|
|
return value;
|
|
|
|
set_error(RE_ERROR_INDEX, NULL);
|
|
return -1;
|
|
}
|
|
|
|
/* Gets a MatchObject's group index.
|
|
*
|
|
* The supplied index can be an integer or a string (group name) object.
|
|
*/
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_get_group_index(MatchObject* self, PyObject*
|
|
index, BOOL allow_neg) {
|
|
Py_ssize_t group;
|
|
|
|
/* Is the index an integer? */
|
|
group = as_group_index(index);
|
|
if (group != -1 || !PyErr_Occurred()) {
|
|
Py_ssize_t min_group = 0;
|
|
|
|
/* Adjust negative indices where valid and allowed. */
|
|
if (group < 0 && allow_neg) {
|
|
group += (Py_ssize_t)self->group_count + 1;
|
|
min_group = 1;
|
|
}
|
|
|
|
if (min_group <= group && (size_t)group <= self->group_count)
|
|
return group;
|
|
|
|
return -1;
|
|
}
|
|
|
|
/* The index might be a group name. */
|
|
if (self->pattern->groupindex) {
|
|
/* Look up the name. */
|
|
PyErr_Clear();
|
|
|
|
index = PyObject_GetItem(self->pattern->groupindex, index);
|
|
if (index) {
|
|
/* Check that we have an integer. */
|
|
group = as_group_index(index);
|
|
Py_DECREF(index);
|
|
if (group != -1 || !PyErr_Occurred())
|
|
return group;
|
|
}
|
|
}
|
|
|
|
PyErr_Clear();
|
|
return -1;
|
|
}
|
|
|
|
/* Gets a MatchObject's group by object index. */
|
|
Py_LOCAL_INLINE(PyObject*) match_get_group(MatchObject* self, PyObject* index,
|
|
PyObject* def, BOOL allow_neg) {
|
|
/* Check that the index is an integer or a string. */
|
|
if (PyInt_Check(index) || PyLong_Check(index) || PyUnicode_Check(index) ||
|
|
PyString_Check(index))
|
|
return match_get_group_by_index(self, match_get_group_index(self,
|
|
index, allow_neg), def);
|
|
|
|
set_error(RE_ERROR_GROUP_INDEX_TYPE, index);
|
|
return NULL;
|
|
}
|
|
|
|
/* Gets info from a MatchObject by object index. */
|
|
Py_LOCAL_INLINE(PyObject*) get_by_arg(MatchObject* self, PyObject* index,
|
|
RE_GetByIndexFunc get_by_index) {
|
|
/* Check that the index is an integer or a string. */
|
|
if (PyInt_Check(index) || PyLong_Check(index) || PyUnicode_Check(index) ||
|
|
PyString_Check(index))
|
|
return get_by_index(self, match_get_group_index(self, index, FALSE));
|
|
|
|
set_error(RE_ERROR_GROUP_INDEX_TYPE, index);
|
|
return NULL;
|
|
}
|
|
|
|
/* MatchObject's 'group' method. */
|
|
static PyObject* match_group(MatchObject* self, PyObject* args) {
|
|
Py_ssize_t size;
|
|
PyObject* result;
|
|
Py_ssize_t i;
|
|
|
|
size = PyTuple_GET_SIZE(args);
|
|
|
|
switch (size) {
|
|
case 0:
|
|
/* group() */
|
|
result = match_get_group_by_index(self, 0, Py_None);
|
|
break;
|
|
case 1:
|
|
/* group(x). PyTuple_GET_ITEM borrows the reference. */
|
|
result = match_get_group(self, PyTuple_GET_ITEM(args, 0), Py_None,
|
|
FALSE);
|
|
break;
|
|
default:
|
|
/* group(x, y, z, ...) */
|
|
/* Fetch multiple items. */
|
|
result = PyTuple_New(size);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
for (i = 0; i < size; i++) {
|
|
PyObject* item;
|
|
|
|
/* PyTuple_GET_ITEM borrows the reference. */
|
|
item = match_get_group(self, PyTuple_GET_ITEM(args, i), Py_None,
|
|
FALSE);
|
|
if (!item) {
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* PyTuple_SET_ITEM borrows the reference. */
|
|
PyTuple_SET_ITEM(result, i, item);
|
|
}
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Generic method for getting info from a MatchObject. */
|
|
Py_LOCAL_INLINE(PyObject*) get_from_match(MatchObject* self, PyObject* args,
|
|
RE_GetByIndexFunc get_by_index) {
|
|
Py_ssize_t size;
|
|
PyObject* result;
|
|
Py_ssize_t i;
|
|
|
|
size = PyTuple_GET_SIZE(args);
|
|
|
|
switch (size) {
|
|
case 0:
|
|
/* get() */
|
|
result = get_by_index(self, 0);
|
|
break;
|
|
case 1:
|
|
/* get(x). PyTuple_GET_ITEM borrows the reference. */
|
|
result = get_by_arg(self, PyTuple_GET_ITEM(args, 0), get_by_index);
|
|
break;
|
|
default:
|
|
/* get(x, y, z, ...) */
|
|
/* Fetch multiple items. */
|
|
result = PyTuple_New(size);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
for (i = 0; i < size; i++) {
|
|
PyObject* item;
|
|
|
|
/* PyTuple_GET_ITEM borrows the reference. */
|
|
item = get_by_arg(self, PyTuple_GET_ITEM(args, i), get_by_index);
|
|
if (!item) {
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* PyTuple_SET_ITEM borrows the reference. */
|
|
PyTuple_SET_ITEM(result, i, item);
|
|
}
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* MatchObject's 'start' method. */
|
|
static PyObject* match_start(MatchObject* self, PyObject* args) {
|
|
return get_from_match(self, args, match_get_start_by_index);
|
|
}
|
|
|
|
/* MatchObject's 'starts' method. */
|
|
static PyObject* match_starts(MatchObject* self, PyObject* args) {
|
|
return get_from_match(self, args, match_get_starts_by_index);
|
|
}
|
|
|
|
/* MatchObject's 'end' method. */
|
|
static PyObject* match_end(MatchObject* self, PyObject* args) {
|
|
return get_from_match(self, args, match_get_end_by_index);
|
|
}
|
|
|
|
/* MatchObject's 'ends' method. */
|
|
static PyObject* match_ends(MatchObject* self, PyObject* args) {
|
|
return get_from_match(self, args, match_get_ends_by_index);
|
|
}
|
|
|
|
/* MatchObject's 'span' method. */
|
|
static PyObject* match_span(MatchObject* self, PyObject* args) {
|
|
return get_from_match(self, args, match_get_span_by_index);
|
|
}
|
|
|
|
/* MatchObject's 'spans' method. */
|
|
static PyObject* match_spans(MatchObject* self, PyObject* args) {
|
|
return get_from_match(self, args, match_get_spans_by_index);
|
|
}
|
|
|
|
/* MatchObject's 'captures' method. */
|
|
static PyObject* match_captures(MatchObject* self, PyObject* args) {
|
|
return get_from_match(self, args, match_get_captures_by_index);
|
|
}
|
|
|
|
/* MatchObject's 'groups' method. */
|
|
static PyObject* match_groups(MatchObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
PyObject* result;
|
|
size_t g;
|
|
|
|
PyObject* def = Py_None;
|
|
static char* kwlist[] = { "default", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O:groups", kwlist, &def))
|
|
return NULL;
|
|
|
|
result = PyTuple_New((Py_ssize_t)self->group_count);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
/* Group 0 is the entire matched portion of the string. */
|
|
for (g = 0; g < self->group_count; g++) {
|
|
PyObject* item;
|
|
|
|
item = match_get_group_by_index(self, (Py_ssize_t)g + 1, def);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyTuple_SET_ITEM borrows the reference. */
|
|
PyTuple_SET_ITEM(result, g, item);
|
|
}
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* MatchObject's 'groupdict' method. */
|
|
static PyObject* match_groupdict(MatchObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
PyObject* result;
|
|
PyObject* keys;
|
|
Py_ssize_t g;
|
|
|
|
PyObject* def = Py_None;
|
|
static char* kwlist[] = { "default", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|O:groupdict", kwlist,
|
|
&def))
|
|
return NULL;
|
|
|
|
result = PyDict_New();
|
|
if (!result || !self->pattern->groupindex)
|
|
return result;
|
|
|
|
keys = PyMapping_Keys(self->pattern->groupindex);
|
|
if (!keys)
|
|
goto failed;
|
|
|
|
for (g = 0; g < PyList_GET_SIZE(keys); g++) {
|
|
PyObject* key;
|
|
PyObject* value;
|
|
int status;
|
|
|
|
/* PyList_GET_ITEM borrows a reference. */
|
|
key = PyList_GET_ITEM(keys, g);
|
|
if (!key)
|
|
goto failed;
|
|
|
|
value = match_get_group(self, key, def, FALSE);
|
|
if (!value)
|
|
goto failed;
|
|
|
|
status = PyDict_SetItem(result, key, value);
|
|
Py_DECREF(value);
|
|
if (status < 0)
|
|
goto failed;
|
|
}
|
|
|
|
Py_DECREF(keys);
|
|
|
|
return result;
|
|
|
|
failed:
|
|
Py_XDECREF(keys);
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* MatchObject's 'capturesdict' method. */
|
|
static PyObject* match_capturesdict(MatchObject* self) {
|
|
PyObject* result;
|
|
PyObject* keys;
|
|
Py_ssize_t g;
|
|
|
|
result = PyDict_New();
|
|
if (!result || !self->pattern->groupindex)
|
|
return result;
|
|
|
|
keys = PyMapping_Keys(self->pattern->groupindex);
|
|
if (!keys)
|
|
goto failed;
|
|
|
|
for (g = 0; g < PyList_GET_SIZE(keys); g++) {
|
|
PyObject* key;
|
|
Py_ssize_t group;
|
|
PyObject* captures;
|
|
int status;
|
|
|
|
/* PyList_GET_ITEM borrows a reference. */
|
|
key = PyList_GET_ITEM(keys, g);
|
|
if (!key)
|
|
goto failed;
|
|
|
|
group = match_get_group_index(self, key, FALSE);
|
|
if (group < 0)
|
|
goto failed;
|
|
|
|
captures = match_get_captures_by_index(self, group);
|
|
if (!captures)
|
|
goto failed;
|
|
|
|
status = PyDict_SetItem(result, key, captures);
|
|
Py_DECREF(captures);
|
|
if (status < 0)
|
|
goto failed;
|
|
}
|
|
|
|
Py_DECREF(keys);
|
|
|
|
return result;
|
|
|
|
failed:
|
|
Py_XDECREF(keys);
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* Gets a Python object by name from a named module. */
|
|
Py_LOCAL_INLINE(PyObject*) get_object(char* module_name, char* object_name) {
|
|
PyObject* module;
|
|
PyObject* object;
|
|
|
|
module = PyImport_ImportModule(module_name);
|
|
if (!module)
|
|
return NULL;
|
|
|
|
object = PyObject_GetAttrString(module, object_name);
|
|
Py_DECREF(module);
|
|
|
|
return object;
|
|
}
|
|
|
|
/* Calls a function in a module. */
|
|
Py_LOCAL_INLINE(PyObject*) call(char* module_name, char* function_name,
|
|
PyObject* args) {
|
|
PyObject* function;
|
|
PyObject* result;
|
|
|
|
if (!args)
|
|
return NULL;
|
|
|
|
function = get_object(module_name, function_name);
|
|
if (!function)
|
|
return NULL;
|
|
|
|
result = PyObject_CallObject(function, args);
|
|
Py_DECREF(function);
|
|
Py_DECREF(args);
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Gets a replacement item from the replacement list.
|
|
*
|
|
* The replacement item could be a string literal or a group.
|
|
*/
|
|
Py_LOCAL_INLINE(PyObject*) get_match_replacement(MatchObject* self, PyObject*
|
|
item, size_t group_count) {
|
|
Py_ssize_t index;
|
|
|
|
if (PyUnicode_Check(item) || PyString_Check(item)) {
|
|
/* It's a literal, which can be added directly to the list. */
|
|
Py_INCREF(item);
|
|
return item;
|
|
}
|
|
|
|
/* Is it a group reference? */
|
|
index = as_group_index(item);
|
|
if (index == -1 && PyErr_Occurred()) {
|
|
/* Not a group either! */
|
|
set_error(RE_ERROR_REPLACEMENT, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0) {
|
|
/* The entire matched portion of the string. */
|
|
return get_slice(self->substring, self->match_start -
|
|
self->substring_offset, self->match_end - self->substring_offset);
|
|
} else if (index >= 1 && (size_t)index <= group_count) {
|
|
/* A group. If it didn't match then return None instead. */
|
|
RE_GroupData* group;
|
|
|
|
group = &self->groups[index - 1];
|
|
|
|
if (group->capture_count > 0)
|
|
return get_slice(self->substring, group->span.start -
|
|
self->substring_offset, group->span.end -
|
|
self->substring_offset);
|
|
else {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
} else {
|
|
/* No such group. */
|
|
set_error(RE_ERROR_NO_SUCH_GROUP, NULL);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* Initialises the join list. */
|
|
Py_LOCAL_INLINE(void) init_join_list(JoinInfo* join_info, BOOL reversed, BOOL
|
|
is_unicode) {
|
|
join_info->list = NULL;
|
|
join_info->item = NULL;
|
|
join_info->reversed = reversed;
|
|
join_info->is_unicode = is_unicode;
|
|
}
|
|
|
|
/* Adds an item to the join list. */
|
|
Py_LOCAL_INLINE(int) add_to_join_list(JoinInfo* join_info, PyObject* item) {
|
|
PyObject* new_item;
|
|
int status;
|
|
|
|
if (join_info->is_unicode) {
|
|
if (PyUnicode_Check(item)) {
|
|
new_item = item;
|
|
Py_INCREF(new_item);
|
|
} else {
|
|
new_item = PyUnicode_FromObject(item);
|
|
if (!new_item) {
|
|
set_error(RE_ERROR_NOT_UNICODE, item);
|
|
return RE_ERROR_NOT_UNICODE;
|
|
}
|
|
}
|
|
} else {
|
|
if (PyString_Check(item)) {
|
|
new_item = item;
|
|
Py_INCREF(new_item);
|
|
} else {
|
|
new_item = PyUnicode_FromObject(item);
|
|
if (!new_item) {
|
|
set_error(RE_ERROR_NOT_STRING, item);
|
|
return RE_ERROR_NOT_STRING;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* If the list already exists then just add the item to it. */
|
|
if (join_info->list) {
|
|
status = PyList_Append(join_info->list, new_item);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
Py_DECREF(new_item);
|
|
return status;
|
|
}
|
|
|
|
/* If we already have an item then we now have 2(!) and we need to put them
|
|
* into a list.
|
|
*/
|
|
if (join_info->item) {
|
|
join_info->list = PyList_New(2);
|
|
if (!join_info->list) {
|
|
status = RE_ERROR_MEMORY;
|
|
goto error;
|
|
}
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(join_info->list, 0, join_info->item);
|
|
join_info->item = NULL;
|
|
|
|
/* PyList_SET_ITEM borrows the reference. */
|
|
PyList_SET_ITEM(join_info->list, 1, new_item);
|
|
return 0;
|
|
}
|
|
|
|
/* This is the first item. */
|
|
join_info->item = new_item;
|
|
|
|
return 0;
|
|
|
|
error:
|
|
Py_DECREF(new_item);
|
|
set_error(status, NULL);
|
|
return status;
|
|
}
|
|
|
|
/* Clears the join list. */
|
|
Py_LOCAL_INLINE(void) clear_join_list(JoinInfo* join_info) {
|
|
Py_XDECREF(join_info->list);
|
|
Py_XDECREF(join_info->item);
|
|
}
|
|
|
|
/* Joins together a list of strings for pattern_subx. */
|
|
Py_LOCAL_INLINE(PyObject*) join_list_info(JoinInfo* join_info) {
|
|
/* If the list already exists then just do the join. */
|
|
if (join_info->list) {
|
|
PyObject* joiner;
|
|
PyObject* result;
|
|
|
|
if (join_info->reversed)
|
|
/* The list needs to be reversed before being joined. */
|
|
PyList_Reverse(join_info->list);
|
|
|
|
if (join_info->is_unicode) {
|
|
/* Concatenate the Unicode strings. */
|
|
joiner = PyUnicode_FromUnicode(NULL, 0);
|
|
if (!joiner) {
|
|
clear_join_list(join_info);
|
|
return NULL;
|
|
}
|
|
|
|
result = PyUnicode_Join(joiner, join_info->list);
|
|
} else {
|
|
joiner = PyString_FromString("");
|
|
if (!joiner) {
|
|
clear_join_list(join_info);
|
|
return NULL;
|
|
}
|
|
|
|
/* Concatenate the bytestrings. */
|
|
result = _PyString_Join(joiner, join_info->list);
|
|
}
|
|
|
|
Py_DECREF(joiner);
|
|
clear_join_list(join_info);
|
|
|
|
return result;
|
|
}
|
|
|
|
/* If we have only 1 item, so we'll just return it. */
|
|
if (join_info->item)
|
|
return join_info->item;
|
|
|
|
/* There are no items, so return an empty string. */
|
|
if (join_info->is_unicode)
|
|
return PyUnicode_FromUnicode(NULL, 0);
|
|
else
|
|
return PyString_FromString("");
|
|
}
|
|
|
|
/* Checks whether a string replacement is a literal.
|
|
*
|
|
* To keep it simple we'll say that a literal is a string which can be used
|
|
* as-is.
|
|
*
|
|
* Returns its length if it is a literal, otherwise -1.
|
|
*/
|
|
Py_LOCAL_INLINE(Py_ssize_t) check_replacement_string(PyObject* str_replacement,
|
|
unsigned char special_char) {
|
|
RE_StringInfo str_info;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
Py_ssize_t pos;
|
|
|
|
if (!get_string(str_replacement, &str_info))
|
|
return -1;
|
|
|
|
switch (str_info.charsize) {
|
|
case 1:
|
|
char_at = bytes1_char_at;
|
|
break;
|
|
case 2:
|
|
char_at = bytes2_char_at;
|
|
break;
|
|
case 4:
|
|
char_at = bytes4_char_at;
|
|
break;
|
|
default:
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
#endif
|
|
return -1;
|
|
}
|
|
|
|
for (pos = 0; pos < str_info.length; pos++) {
|
|
if (char_at(str_info.characters, pos) == special_char) {
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return str_info.length;
|
|
}
|
|
|
|
/* MatchObject's 'expand' method. */
|
|
static PyObject* match_expand(MatchObject* self, PyObject* str_template) {
|
|
Py_ssize_t literal_length;
|
|
PyObject* replacement;
|
|
JoinInfo join_info;
|
|
Py_ssize_t size;
|
|
Py_ssize_t i;
|
|
|
|
/* Is the template just a literal? */
|
|
literal_length = check_replacement_string(str_template, '\\');
|
|
if (literal_length >= 0) {
|
|
/* It's a literal. */
|
|
Py_INCREF(str_template);
|
|
return str_template;
|
|
}
|
|
|
|
/* Hand the template to the template compiler. */
|
|
replacement = call(RE_MODULE, "_compile_replacement_helper",
|
|
PyTuple_Pack(2, self->pattern, str_template));
|
|
if (!replacement)
|
|
return NULL;
|
|
|
|
init_join_list(&join_info, FALSE, PyUnicode_Check(self->string));
|
|
|
|
/* Add each part of the template to the list. */
|
|
size = PyList_GET_SIZE(replacement);
|
|
for (i = 0; i < size; i++) {
|
|
PyObject* item;
|
|
PyObject* str_item;
|
|
|
|
/* PyList_GET_ITEM borrows a reference. */
|
|
item = PyList_GET_ITEM(replacement, i);
|
|
str_item = get_match_replacement(self, item, self->group_count);
|
|
if (!str_item)
|
|
goto error;
|
|
|
|
/* Add to the list. */
|
|
if (str_item == Py_None)
|
|
Py_DECREF(str_item);
|
|
else {
|
|
int status;
|
|
|
|
status = add_to_join_list(&join_info, str_item);
|
|
Py_DECREF(str_item);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
}
|
|
|
|
Py_DECREF(replacement);
|
|
|
|
/* Convert the list to a single string (also cleans up join_info). */
|
|
return join_list_info(&join_info);
|
|
|
|
error:
|
|
clear_join_list(&join_info);
|
|
Py_DECREF(replacement);
|
|
return NULL;
|
|
}
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
/* Gets a MatchObject's group dictionary. */
|
|
Py_LOCAL_INLINE(PyObject*) match_get_group_dict(MatchObject* self) {
|
|
PyObject* result;
|
|
PyObject* keys;
|
|
Py_ssize_t g;
|
|
|
|
result = PyDict_New();
|
|
if (!result || !self->pattern->groupindex)
|
|
return result;
|
|
|
|
keys = PyMapping_Keys(self->pattern->groupindex);
|
|
if (!keys)
|
|
goto failed;
|
|
|
|
for (g = 0; g < PyList_GET_SIZE(keys); g++) {
|
|
int status;
|
|
PyObject* key;
|
|
PyObject* value;
|
|
|
|
/* PyList_GET_ITEM borrows a reference. */
|
|
key = PyList_GET_ITEM(keys, g);
|
|
if (!key)
|
|
goto failed;
|
|
|
|
value = match_get_group(self, key, Py_None, FALSE);
|
|
if (!value)
|
|
goto failed;
|
|
|
|
status = PyDict_SetItem(result, key, value);
|
|
Py_DECREF(value);
|
|
if (status < 0)
|
|
goto failed;
|
|
}
|
|
|
|
Py_DECREF(keys);
|
|
|
|
return result;
|
|
|
|
failed:
|
|
Py_XDECREF(keys);
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* MatchObject's 'expandf' method. */
|
|
static PyObject* match_expandf(MatchObject* self, PyObject* str_template) {
|
|
PyObject* format_func;
|
|
PyObject* args = NULL;
|
|
size_t g;
|
|
PyObject* kwargs = NULL;
|
|
PyObject* result;
|
|
|
|
format_func = PyObject_GetAttrString(str_template, "format");
|
|
if (!format_func)
|
|
return NULL;
|
|
|
|
args = PyTuple_New((Py_ssize_t)self->group_count + 1);
|
|
if (!args)
|
|
goto error;
|
|
|
|
for (g = 0; g < self->group_count + 1; g++)
|
|
/* PyTuple_SetItem borrows the reference. */
|
|
PyTuple_SetItem(args, (Py_ssize_t)g, match_get_group_by_index(self,
|
|
(Py_ssize_t)g, Py_None));
|
|
|
|
kwargs = match_get_group_dict(self);
|
|
if (!kwargs)
|
|
goto error;
|
|
|
|
result = PyObject_Call(format_func, args, kwargs);
|
|
Py_DECREF(kwargs);
|
|
Py_DECREF(args);
|
|
Py_DECREF(format_func);
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_XDECREF(args);
|
|
Py_DECREF(format_func);
|
|
return NULL;
|
|
}
|
|
|
|
#endif
|
|
Py_LOCAL_INLINE(PyObject*) make_match_copy(MatchObject* self);
|
|
|
|
/* MatchObject's '__copy__' method. */
|
|
static PyObject* match_copy(MatchObject* self, PyObject *unused) {
|
|
return make_match_copy(self);
|
|
}
|
|
|
|
/* MatchObject's '__deepcopy__' method. */
|
|
static PyObject* match_deepcopy(MatchObject* self, PyObject* memo) {
|
|
return make_match_copy(self);
|
|
}
|
|
|
|
/* MatchObject's 'regs' attribute. */
|
|
static PyObject* match_regs(MatchObject* self) {
|
|
PyObject* regs;
|
|
PyObject* item;
|
|
size_t g;
|
|
|
|
regs = PyTuple_New((Py_ssize_t)self->group_count + 1);
|
|
if (!regs)
|
|
return NULL;
|
|
|
|
item = Py_BuildValue("nn", self->match_start, self->match_end);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyTuple_SET_ITEM borrows the reference. */
|
|
PyTuple_SET_ITEM(regs, 0, item);
|
|
|
|
for (g = 0; g < self->group_count; g++) {
|
|
RE_GroupSpan* span;
|
|
|
|
span = &self->groups[g].span;
|
|
item = Py_BuildValue("nn", span->start, span->end);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyTuple_SET_ITEM borrows the reference. */
|
|
PyTuple_SET_ITEM(regs, g + 1, item);
|
|
}
|
|
|
|
Py_INCREF(regs);
|
|
self->regs = regs;
|
|
|
|
return regs;
|
|
|
|
error:
|
|
Py_DECREF(regs);
|
|
return NULL;
|
|
}
|
|
|
|
/* MatchObject's slice method. */
|
|
Py_LOCAL_INLINE(PyObject*) match_get_group_slice(MatchObject* self, PyObject*
|
|
slice) {
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
Py_ssize_t step;
|
|
Py_ssize_t slice_length;
|
|
|
|
if (PySlice_GetIndicesEx((PySliceObject*)slice,
|
|
(Py_ssize_t)self->group_count + 1, &start, &end, &step, &slice_length) <
|
|
0)
|
|
return NULL;
|
|
|
|
if (slice_length <= 0)
|
|
return PyTuple_New(0);
|
|
else {
|
|
PyObject* result;
|
|
Py_ssize_t cur;
|
|
Py_ssize_t i;
|
|
|
|
result = PyTuple_New(slice_length);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
cur = start;
|
|
for (i = 0; i < slice_length; i++) {
|
|
/* PyTuple_SetItem borrows the reference. */
|
|
PyTuple_SetItem(result, i, match_get_group_by_index(self, cur,
|
|
Py_None));
|
|
cur += step;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
}
|
|
|
|
/* MatchObject's length method. */
|
|
Py_LOCAL_INLINE(Py_ssize_t) match_length(MatchObject* self) {
|
|
return (Py_ssize_t)self->group_count + 1;
|
|
}
|
|
|
|
/* MatchObject's '__getitem__' method. */
|
|
static PyObject* match_getitem(MatchObject* self, PyObject* item) {
|
|
if (PySlice_Check(item))
|
|
return match_get_group_slice(self, item);
|
|
|
|
return match_get_group(self, item, Py_None, TRUE);
|
|
}
|
|
|
|
/* Determines the portion of the target string which is covered by the group
|
|
* captures.
|
|
*/
|
|
Py_LOCAL_INLINE(void) determine_target_substring(MatchObject* match,
|
|
Py_ssize_t* slice_start, Py_ssize_t* slice_end) {
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
size_t g;
|
|
|
|
start = match->pos;
|
|
end = match->endpos;
|
|
|
|
for (g = 0; g < match->group_count; g++) {
|
|
RE_GroupSpan* span;
|
|
size_t c;
|
|
|
|
span = &match->groups[g].span;
|
|
if (span->start >= 0 && span->start < start)
|
|
start = span->start;
|
|
if (span->end >= 0 && span->end > end)
|
|
end = span->end;
|
|
|
|
for (c = 0; c < match->groups[g].capture_count; c++) {
|
|
RE_GroupSpan* span;
|
|
|
|
span = match->groups[g].captures;
|
|
if (span->start >= 0 && span->start < start)
|
|
start = span->start;
|
|
if (span->end >= 0 && span->end > end)
|
|
end = span->end;
|
|
}
|
|
}
|
|
|
|
*slice_start = start;
|
|
*slice_end = end;
|
|
}
|
|
|
|
/* MatchObject's 'detach_string' method. */
|
|
static PyObject* match_detach_string(MatchObject* self, PyObject* unused) {
|
|
if (self->string) {
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
PyObject* substring;
|
|
|
|
determine_target_substring(self, &start, &end);
|
|
|
|
substring = get_slice(self->string, start, end);
|
|
if (substring) {
|
|
Py_XDECREF(self->substring);
|
|
self->substring = substring;
|
|
self->substring_offset = start;
|
|
|
|
Py_DECREF(self->string);
|
|
self->string = NULL;
|
|
}
|
|
}
|
|
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
|
|
/* The documentation of a MatchObject. */
|
|
PyDoc_STRVAR(match_group_doc,
|
|
"group([group1, ...]) --> string or tuple of strings.\n\
|
|
Return one or more subgroups of the match. If there is a single argument,\n\
|
|
the result is a single string, or None if the group did not contribute to\n\
|
|
the match; if there are multiple arguments, the result is a tuple with one\n\
|
|
item per argument; if there are no arguments, the whole match is returned.\n\
|
|
Group 0 is the whole match.");
|
|
|
|
PyDoc_STRVAR(match_start_doc,
|
|
"start([group1, ...]) --> int or tuple of ints.\n\
|
|
Return the index of the start of one or more subgroups of the match. If\n\
|
|
there is a single argument, the result is an index, or -1 if the group did\n\
|
|
not contribute to the match; if there are multiple arguments, the result is\n\
|
|
a tuple with one item per argument; if there are no arguments, the index of\n\
|
|
the start of the whole match is returned. Group 0 is the whole match.");
|
|
|
|
PyDoc_STRVAR(match_end_doc,
|
|
"end([group1, ...]) --> int or tuple of ints.\n\
|
|
Return the index of the end of one or more subgroups of the match. If there\n\
|
|
is a single argument, the result is an index, or -1 if the group did not\n\
|
|
contribute to the match; if there are multiple arguments, the result is a\n\
|
|
tuple with one item per argument; if there are no arguments, the index of\n\
|
|
the end of the whole match is returned. Group 0 is the whole match.");
|
|
|
|
PyDoc_STRVAR(match_span_doc,
|
|
"span([group1, ...]) --> 2-tuple of int or tuple of 2-tuple of ints.\n\
|
|
Return the span (a 2-tuple of the indices of the start and end) of one or\n\
|
|
more subgroups of the match. If there is a single argument, the result is a\n\
|
|
span, or (-1, -1) if the group did not contribute to the match; if there are\n\
|
|
multiple arguments, the result is a tuple with one item per argument; if\n\
|
|
there are no arguments, the span of the whole match is returned. Group 0 is\n\
|
|
the whole match.");
|
|
|
|
PyDoc_STRVAR(match_groups_doc,
|
|
"groups(default=None) --> tuple of strings.\n\
|
|
Return a tuple containing all the subgroups of the match. The argument is\n\
|
|
the default for groups that did not participate in the match.");
|
|
|
|
PyDoc_STRVAR(match_groupdict_doc,
|
|
"groupdict(default=None) --> dict.\n\
|
|
Return a dictionary containing all the named subgroups of the match, keyed\n\
|
|
by the subgroup name. The argument is the value to be given for groups that\n\
|
|
did not participate in the match.");
|
|
|
|
PyDoc_STRVAR(match_capturesdict_doc,
|
|
"capturesdict() --> dict.\n\
|
|
Return a dictionary containing the captures of all the named subgroups of the\n\
|
|
match, keyed by the subgroup name.");
|
|
|
|
PyDoc_STRVAR(match_expand_doc,
|
|
"expand(template) --> string.\n\
|
|
Return the string obtained by doing backslash substitution on the template,\n\
|
|
as done by the sub() method.");
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
PyDoc_STRVAR(match_expandf_doc,
|
|
"expandf(format) --> string.\n\
|
|
Return the string obtained by using the format, as done by the subf()\n\
|
|
method.");
|
|
|
|
#endif
|
|
PyDoc_STRVAR(match_captures_doc,
|
|
"captures([group1, ...]) --> list of strings or tuple of list of strings.\n\
|
|
Return the captures of one or more subgroups of the match. If there is a\n\
|
|
single argument, the result is a list of strings; if there are multiple\n\
|
|
arguments, the result is a tuple of lists with one item per argument; if\n\
|
|
there are no arguments, the captures of the whole match is returned. Group\n\
|
|
0 is the whole match.");
|
|
|
|
PyDoc_STRVAR(match_starts_doc,
|
|
"starts([group1, ...]) --> list of ints or tuple of list of ints.\n\
|
|
Return the indices of the starts of the captures of one or more subgroups of\n\
|
|
the match. If there is a single argument, the result is a list of indices;\n\
|
|
if there are multiple arguments, the result is a tuple of lists with one\n\
|
|
item per argument; if there are no arguments, the indices of the starts of\n\
|
|
the captures of the whole match is returned. Group 0 is the whole match.");
|
|
|
|
PyDoc_STRVAR(match_ends_doc,
|
|
"ends([group1, ...]) --> list of ints or tuple of list of ints.\n\
|
|
Return the indices of the ends of the captures of one or more subgroups of\n\
|
|
the match. If there is a single argument, the result is a list of indices;\n\
|
|
if there are multiple arguments, the result is a tuple of lists with one\n\
|
|
item per argument; if there are no arguments, the indices of the ends of the\n\
|
|
captures of the whole match is returned. Group 0 is the whole match.");
|
|
|
|
PyDoc_STRVAR(match_spans_doc,
|
|
"spans([group1, ...]) --> list of 2-tuple of ints or tuple of list of 2-tuple of ints.\n\
|
|
Return the spans (a 2-tuple of the indices of the start and end) of the\n\
|
|
captures of one or more subgroups of the match. If there is a single\n\
|
|
argument, the result is a list of spans; if there are multiple arguments,\n\
|
|
the result is a tuple of lists with one item per argument; if there are no\n\
|
|
arguments, the spans of the captures of the whole match is returned. Group\n\
|
|
0 is the whole match.");
|
|
|
|
PyDoc_STRVAR(match_detach_string_doc,
|
|
"detach_string()\n\
|
|
Detaches the target string from the match object. The 'string' attribute\n\
|
|
will become None.");
|
|
|
|
/* MatchObject's methods. */
|
|
static PyMethodDef match_methods[] = {
|
|
{"group", (PyCFunction)match_group, METH_VARARGS, match_group_doc},
|
|
{"start", (PyCFunction)match_start, METH_VARARGS, match_start_doc},
|
|
{"end", (PyCFunction)match_end, METH_VARARGS, match_end_doc},
|
|
{"span", (PyCFunction)match_span, METH_VARARGS, match_span_doc},
|
|
{"groups", (PyCFunction)match_groups, METH_VARARGS|METH_KEYWORDS,
|
|
match_groups_doc},
|
|
{"groupdict", (PyCFunction)match_groupdict, METH_VARARGS|METH_KEYWORDS,
|
|
match_groupdict_doc},
|
|
{"capturesdict", (PyCFunction)match_capturesdict, METH_NOARGS,
|
|
match_capturesdict_doc},
|
|
{"expand", (PyCFunction)match_expand, METH_O, match_expand_doc},
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
{"expandf", (PyCFunction)match_expandf, METH_O, match_expandf_doc},
|
|
#endif
|
|
{"captures", (PyCFunction)match_captures, METH_VARARGS,
|
|
match_captures_doc},
|
|
{"starts", (PyCFunction)match_starts, METH_VARARGS, match_starts_doc},
|
|
{"ends", (PyCFunction)match_ends, METH_VARARGS, match_ends_doc},
|
|
{"spans", (PyCFunction)match_spans, METH_VARARGS, match_spans_doc},
|
|
{"detach_string", (PyCFunction)match_detach_string, METH_NOARGS,
|
|
match_detach_string_doc},
|
|
{"__copy__", (PyCFunction)match_copy, METH_NOARGS},
|
|
{"__deepcopy__", (PyCFunction)match_deepcopy, METH_O},
|
|
{"__getitem__", (PyCFunction)match_getitem, METH_O|METH_COEXIST},
|
|
{NULL, NULL}
|
|
};
|
|
|
|
PyDoc_STRVAR(match_doc, "Match object");
|
|
|
|
/* MatchObject's 'lastindex' attribute. */
|
|
static PyObject* match_lastindex(PyObject* self_) {
|
|
MatchObject* self;
|
|
|
|
self = (MatchObject*)self_;
|
|
|
|
if (self->lastindex >= 0)
|
|
return Py_BuildValue("n", self->lastindex);
|
|
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
|
|
/* MatchObject's 'lastgroup' attribute. */
|
|
static PyObject* match_lastgroup(PyObject* self_) {
|
|
MatchObject* self;
|
|
|
|
self = (MatchObject*)self_;
|
|
|
|
if (self->pattern->indexgroup && self->lastgroup >= 0) {
|
|
PyObject* index;
|
|
PyObject* result;
|
|
|
|
index = Py_BuildValue("n", self->lastgroup);
|
|
|
|
/* PyDict_GetItem returns borrows a reference. */
|
|
result = PyDict_GetItem(self->pattern->indexgroup, index);
|
|
Py_DECREF(index);
|
|
if (result) {
|
|
Py_INCREF(result);
|
|
return result;
|
|
}
|
|
PyErr_Clear();
|
|
}
|
|
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
|
|
/* MatchObject's 'string' attribute. */
|
|
static PyObject* match_string(PyObject* self_) {
|
|
MatchObject* self;
|
|
|
|
self = (MatchObject*)self_;
|
|
|
|
if (self->string) {
|
|
Py_INCREF(self->string);
|
|
return self->string;
|
|
} else {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
}
|
|
#if PY_VERSION_HEX < 0x02060000
|
|
|
|
/* MatchObject's 'partial' attribute. */
|
|
static PyObject* match_partial(PyObject* self_) {
|
|
MatchObject* self;
|
|
PyObject* result;
|
|
|
|
self = (MatchObject*)self_;
|
|
|
|
result = self->partial ? Py_True : Py_False;
|
|
Py_INCREF(result);
|
|
|
|
return result;
|
|
}
|
|
#endif
|
|
|
|
/* MatchObject's 'fuzzy_counts' attribute. */
|
|
static PyObject* match_fuzzy_counts(PyObject* self_) {
|
|
MatchObject* self;
|
|
|
|
self = (MatchObject*)self_;
|
|
|
|
return Py_BuildValue("nnn", self->fuzzy_counts[RE_FUZZY_SUB],
|
|
self->fuzzy_counts[RE_FUZZY_INS], self->fuzzy_counts[RE_FUZZY_DEL]);
|
|
}
|
|
|
|
static PyGetSetDef match_getset[] = {
|
|
{"lastindex", (getter)match_lastindex, (setter)NULL,
|
|
"The group number of the last matched capturing group, or None."},
|
|
{"lastgroup", (getter)match_lastgroup, (setter)NULL,
|
|
"The name of the last matched capturing group, or None."},
|
|
{"regs", (getter)match_regs, (setter)NULL,
|
|
"A tuple of the spans of the capturing groups."},
|
|
{"string", (getter)match_string, (setter)NULL,
|
|
"The string that was searched, or None if it has been detached."},
|
|
#if PY_VERSION_HEX < 0x02060000
|
|
{"partial", (getter)match_partial, (setter)NULL,
|
|
"Whether it's a partial match."},
|
|
#endif
|
|
{"fuzzy_counts", (getter)match_fuzzy_counts, (setter)NULL,
|
|
"A tuple of the number of substitutions, insertions and deletions."},
|
|
{NULL} /* Sentinel */
|
|
};
|
|
|
|
static PyMemberDef match_members[] = {
|
|
{"re", T_OBJECT, offsetof(MatchObject, pattern), READONLY,
|
|
"The regex object that produced this match object."},
|
|
{"pos", T_PYSSIZET, offsetof(MatchObject, pos), READONLY,
|
|
"The position at which the regex engine starting searching."},
|
|
{"endpos", T_PYSSIZET, offsetof(MatchObject, endpos), READONLY,
|
|
"The final position beyond which the regex engine won't search."},
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
{"partial", T_BOOL, offsetof(MatchObject, partial), READONLY,
|
|
"Whether it's a partial match."},
|
|
#endif
|
|
{NULL} /* Sentinel */
|
|
};
|
|
|
|
static PyMappingMethods match_as_mapping = {
|
|
(lenfunc)match_length, /* mp_length */
|
|
(binaryfunc)match_getitem, /* mp_subscript */
|
|
0, /* mp_ass_subscript */
|
|
};
|
|
|
|
static PyTypeObject Match_Type = {
|
|
PyObject_HEAD_INIT(NULL)
|
|
0,
|
|
"_" RE_MODULE "." "Match",
|
|
sizeof(MatchObject)
|
|
};
|
|
|
|
/* Copies the groups. */
|
|
Py_LOCAL_INLINE(RE_GroupData*) copy_groups(RE_GroupData* groups, size_t
|
|
group_count) {
|
|
size_t span_count;
|
|
size_t g;
|
|
RE_GroupData* groups_copy;
|
|
RE_GroupSpan* spans_copy;
|
|
size_t offset;
|
|
|
|
/* Calculate the total size of the group info. */
|
|
span_count = 0;
|
|
for (g = 0; g < group_count; g++)
|
|
span_count += groups[g].capture_count;
|
|
|
|
/* Allocate the storage for the group info in a single block. */
|
|
groups_copy = (RE_GroupData*)re_alloc(group_count * sizeof(RE_GroupData) +
|
|
span_count * sizeof(RE_GroupSpan));
|
|
if (!groups_copy)
|
|
return NULL;
|
|
|
|
/* The storage for the spans comes after the other group info. */
|
|
spans_copy = (RE_GroupSpan*)&groups_copy[group_count];
|
|
|
|
/* There's no need to initialise the spans info. */
|
|
memset(groups_copy, 0, group_count * sizeof(RE_GroupData));
|
|
|
|
offset = 0;
|
|
for (g = 0; g < group_count; g++) {
|
|
RE_GroupData* orig;
|
|
RE_GroupData* copy;
|
|
|
|
orig = &groups[g];
|
|
copy = &groups_copy[g];
|
|
copy->span = orig->span;
|
|
|
|
copy->captures = &spans_copy[offset];
|
|
offset += orig->capture_count;
|
|
|
|
if (orig->capture_count > 0) {
|
|
Py_MEMCPY(copy->captures, orig->captures, orig->capture_count *
|
|
sizeof(RE_GroupSpan));
|
|
copy->capture_capacity = orig->capture_count;
|
|
copy->capture_count = orig->capture_count;
|
|
}
|
|
}
|
|
|
|
return groups_copy;
|
|
}
|
|
|
|
/* Makes a copy of a MatchObject. */
|
|
Py_LOCAL_INLINE(PyObject*) make_match_copy(MatchObject* self) {
|
|
MatchObject* match;
|
|
|
|
if (!self->string) {
|
|
/* The target string has been detached, so the MatchObject is now
|
|
* immutable.
|
|
*/
|
|
Py_INCREF(self);
|
|
return (PyObject*)self;
|
|
}
|
|
|
|
/* Create a MatchObject. */
|
|
match = PyObject_NEW(MatchObject, &Match_Type);
|
|
if (!match)
|
|
return NULL;
|
|
|
|
Py_MEMCPY(match, self, sizeof(MatchObject));
|
|
|
|
Py_INCREF(match->string);
|
|
Py_INCREF(match->substring);
|
|
Py_INCREF(match->pattern);
|
|
|
|
/* Copy the groups to the MatchObject. */
|
|
if (self->group_count > 0) {
|
|
match->groups = copy_groups(self->groups, self->group_count);
|
|
if (!match->groups) {
|
|
Py_DECREF(match);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
return (PyObject*)match;
|
|
}
|
|
|
|
/* Creates a new MatchObject. */
|
|
Py_LOCAL_INLINE(PyObject*) pattern_new_match(PatternObject* pattern, RE_State*
|
|
state, int status) {
|
|
/* Create MatchObject (from state object). */
|
|
if (status > 0 || status == RE_ERROR_PARTIAL) {
|
|
MatchObject* match;
|
|
|
|
/* Create a MatchObject. */
|
|
match = PyObject_NEW(MatchObject, &Match_Type);
|
|
if (!match)
|
|
return NULL;
|
|
|
|
match->string = state->string;
|
|
match->substring = state->string;
|
|
match->substring_offset = 0;
|
|
match->pattern = pattern;
|
|
match->regs = NULL;
|
|
match->fuzzy_counts[RE_FUZZY_SUB] =
|
|
state->total_fuzzy_counts[RE_FUZZY_SUB];
|
|
match->fuzzy_counts[RE_FUZZY_INS] =
|
|
state->total_fuzzy_counts[RE_FUZZY_INS];
|
|
match->fuzzy_counts[RE_FUZZY_DEL] =
|
|
state->total_fuzzy_counts[RE_FUZZY_DEL];
|
|
match->partial = status == RE_ERROR_PARTIAL;
|
|
Py_INCREF(match->string);
|
|
Py_INCREF(match->substring);
|
|
Py_INCREF(match->pattern);
|
|
|
|
/* Copy the groups to the MatchObject. */
|
|
if (pattern->public_group_count > 0) {
|
|
match->groups = copy_groups(state->groups,
|
|
pattern->public_group_count);
|
|
if (!match->groups) {
|
|
Py_DECREF(match);
|
|
return NULL;
|
|
}
|
|
} else
|
|
match->groups = NULL;
|
|
|
|
match->group_count = pattern->public_group_count;
|
|
|
|
match->pos = state->slice_start;
|
|
match->endpos = state->slice_end;
|
|
|
|
if (state->reverse) {
|
|
match->match_start = state->text_pos;
|
|
match->match_end = state->match_pos;
|
|
} else {
|
|
match->match_start = state->match_pos;
|
|
match->match_end = state->text_pos;
|
|
}
|
|
|
|
match->lastindex = state->lastindex;
|
|
match->lastgroup = state->lastgroup;
|
|
|
|
return (PyObject*)match;
|
|
} else if (status == 0) {
|
|
/* No match. */
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
} else {
|
|
/* Internal error. */
|
|
set_error(status, NULL);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* Gets the text of a capture group from a state. */
|
|
Py_LOCAL_INLINE(PyObject*) state_get_group(RE_State* state, Py_ssize_t index,
|
|
PyObject* string, BOOL empty) {
|
|
RE_GroupData* group;
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
|
|
group = &state->groups[index - 1];
|
|
|
|
if (string != Py_None && index >= 1 && (size_t)index <=
|
|
state->pattern->public_group_count && group->capture_count > 0) {
|
|
start = group->span.start;
|
|
end = group->span.end;
|
|
} else {
|
|
if (empty)
|
|
/* Want an empty string. */
|
|
start = end = 0;
|
|
else {
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
}
|
|
|
|
return get_slice(string, start, end);
|
|
}
|
|
|
|
/* Acquires the lock (mutex) on the state if there's one.
|
|
*
|
|
* It also increments the owner's refcount just to ensure that it won't be
|
|
* destroyed by another thread.
|
|
*/
|
|
Py_LOCAL_INLINE(void) acquire_state_lock(PyObject* owner, RE_SafeState*
|
|
safe_state) {
|
|
RE_State* state;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
if (state->lock) {
|
|
/* In order to avoid deadlock we need to release the GIL while trying
|
|
* to acquire the lock.
|
|
*/
|
|
Py_INCREF(owner);
|
|
if (!PyThread_acquire_lock(state->lock, 0)) {
|
|
release_GIL(safe_state);
|
|
PyThread_acquire_lock(state->lock, 1);
|
|
acquire_GIL(safe_state);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Releases the lock (mutex) on the state if there's one.
|
|
*
|
|
* It also decrements the owner's refcount, which was incremented when the lock
|
|
* was acquired.
|
|
*/
|
|
Py_LOCAL_INLINE(void) release_state_lock(PyObject* owner, RE_SafeState*
|
|
safe_state) {
|
|
RE_State* state;
|
|
|
|
state = safe_state->re_state;
|
|
|
|
if (state->lock) {
|
|
PyThread_release_lock(state->lock);
|
|
Py_DECREF(owner);
|
|
}
|
|
}
|
|
|
|
/* Implements the functionality of ScanObject's search and match methods. */
|
|
Py_LOCAL_INLINE(PyObject*) scanner_search_or_match(ScannerObject* self, BOOL
|
|
search) {
|
|
RE_State* state;
|
|
RE_SafeState safe_state;
|
|
PyObject* match;
|
|
|
|
state = &self->state;
|
|
|
|
/* Initialise the "safe state" structure. */
|
|
safe_state.re_state = state;
|
|
safe_state.thread_state = NULL;
|
|
|
|
/* Acquire the state lock in case we're sharing the scanner object across
|
|
* threads.
|
|
*/
|
|
acquire_state_lock((PyObject*)self, &safe_state);
|
|
|
|
if (self->status == RE_ERROR_FAILURE || self->status == RE_ERROR_PARTIAL) {
|
|
/* No or partial match. */
|
|
release_state_lock((PyObject*)self, &safe_state);
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
} else if (self->status < 0) {
|
|
/* Internal error. */
|
|
release_state_lock((PyObject*)self, &safe_state);
|
|
set_error(self->status, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
/* Look for another match. */
|
|
self->status = do_match(&safe_state, search);
|
|
if (self->status >= 0 || self->status == RE_ERROR_PARTIAL) {
|
|
/* Create the match object. */
|
|
match = pattern_new_match(self->pattern, state, self->status);
|
|
|
|
if (search && state->overlapped) {
|
|
/* Advance one character. */
|
|
Py_ssize_t step;
|
|
|
|
step = state->reverse ? -1 : 1;
|
|
state->text_pos = state->match_pos + step;
|
|
state->must_advance = FALSE;
|
|
} else
|
|
/* Continue from where we left off, but don't allow 2 contiguous
|
|
* zero-width matches.
|
|
*/
|
|
state->must_advance = state->text_pos == state->match_pos;
|
|
} else
|
|
/* Internal error. */
|
|
match = NULL;
|
|
|
|
/* Release the state lock. */
|
|
release_state_lock((PyObject*)self, &safe_state);
|
|
|
|
return match;
|
|
}
|
|
|
|
/* ScannerObject's 'match' method. */
|
|
static PyObject* scanner_match(ScannerObject* self, PyObject* unused) {
|
|
return scanner_search_or_match(self, FALSE);
|
|
}
|
|
|
|
/* ScannerObject's 'search' method. */
|
|
static PyObject* scanner_search(ScannerObject* self, PyObject *unused) {
|
|
return scanner_search_or_match(self, TRUE);
|
|
}
|
|
|
|
/* ScannerObject's 'next' method. */
|
|
static PyObject* scanner_next(PyObject* self) {
|
|
PyObject* match;
|
|
|
|
match = scanner_search((ScannerObject*)self, NULL);
|
|
|
|
if (match == Py_None) {
|
|
/* No match. */
|
|
Py_DECREF(Py_None);
|
|
PyErr_SetNone(PyExc_StopIteration);
|
|
return NULL;
|
|
}
|
|
|
|
return match;
|
|
}
|
|
|
|
/* Returns an iterator for a ScannerObject.
|
|
*
|
|
* The iterator is actually the ScannerObject itself.
|
|
*/
|
|
static PyObject* scanner_iter(PyObject* self) {
|
|
Py_INCREF(self);
|
|
return self;
|
|
}
|
|
|
|
/* Gets the next result from a scanner iterator. */
|
|
static PyObject* scanner_iternext(PyObject* self) {
|
|
PyObject* match;
|
|
|
|
match = scanner_search((ScannerObject*)self, NULL);
|
|
|
|
if (match == Py_None) {
|
|
/* No match. */
|
|
Py_DECREF(match);
|
|
return NULL;
|
|
}
|
|
|
|
return match;
|
|
}
|
|
|
|
/* Makes a copy of a ScannerObject.
|
|
*
|
|
* It actually doesn't make a copy, just returns the original object.
|
|
*/
|
|
Py_LOCAL_INLINE(PyObject*) make_scanner_copy(ScannerObject* self) {
|
|
Py_INCREF(self);
|
|
return (PyObject*)self;
|
|
}
|
|
|
|
/* ScannerObject's '__copy__' method. */
|
|
static PyObject* scanner_copy(ScannerObject* self, PyObject *unused) {
|
|
return make_scanner_copy(self);
|
|
}
|
|
|
|
/* ScannerObject's '__deepcopy__' method. */
|
|
static PyObject* scanner_deepcopy(ScannerObject* self, PyObject* memo) {
|
|
return make_scanner_copy(self);
|
|
}
|
|
|
|
/* The documentation of a ScannerObject. */
|
|
PyDoc_STRVAR(scanner_match_doc,
|
|
"match() --> MatchObject or None.\n\
|
|
Match at the current position in the string.");
|
|
|
|
PyDoc_STRVAR(scanner_search_doc,
|
|
"search() --> MatchObject or None.\n\
|
|
Search from the current position in the string.");
|
|
|
|
/* ScannerObject's methods. */
|
|
static PyMethodDef scanner_methods[] = {
|
|
{"next", (PyCFunction)scanner_next, METH_NOARGS},
|
|
{"match", (PyCFunction)scanner_match, METH_NOARGS, scanner_match_doc},
|
|
{"search", (PyCFunction)scanner_search, METH_NOARGS, scanner_search_doc},
|
|
{"__copy__", (PyCFunction)scanner_copy, METH_NOARGS},
|
|
{"__deepcopy__", (PyCFunction)scanner_deepcopy, METH_O},
|
|
{NULL, NULL}
|
|
};
|
|
|
|
PyDoc_STRVAR(scanner_doc, "Scanner object");
|
|
|
|
/* Deallocates a ScannerObject. */
|
|
static void scanner_dealloc(PyObject* self_) {
|
|
ScannerObject* self;
|
|
|
|
self = (ScannerObject*)self_;
|
|
|
|
state_fini(&self->state);
|
|
Py_DECREF(self->pattern);
|
|
PyObject_DEL(self);
|
|
}
|
|
|
|
static PyMemberDef scanner_members[] = {
|
|
{"pattern", T_OBJECT, offsetof(ScannerObject, pattern), READONLY,
|
|
"The regex object that produced this scanner object."},
|
|
{NULL} /* Sentinel */
|
|
};
|
|
|
|
static PyTypeObject Scanner_Type = {
|
|
PyObject_HEAD_INIT(NULL)
|
|
0,
|
|
"_" RE_MODULE "." "Scanner",
|
|
sizeof(ScannerObject)
|
|
};
|
|
|
|
/* Decodes a 'concurrent' argument. */
|
|
Py_LOCAL_INLINE(int) decode_concurrent(PyObject* concurrent) {
|
|
Py_ssize_t value;
|
|
|
|
if (concurrent == Py_None)
|
|
return RE_CONC_DEFAULT;
|
|
|
|
value = PyLong_AsLong(concurrent);
|
|
if (value == -1 && PyErr_Occurred()) {
|
|
set_error(RE_ERROR_CONCURRENT, NULL);
|
|
return -1;
|
|
}
|
|
|
|
return value ? RE_CONC_YES : RE_CONC_NO;
|
|
}
|
|
|
|
/* Decodes a 'partial' argument. */
|
|
Py_LOCAL_INLINE(BOOL) decode_partial(PyObject* partial) {
|
|
Py_ssize_t value;
|
|
|
|
if (partial == Py_False)
|
|
return FALSE;
|
|
|
|
if (partial == Py_True)
|
|
return TRUE;
|
|
|
|
value = PyLong_AsLong(partial);
|
|
if (value == -1 && PyErr_Occurred()) {
|
|
PyErr_Clear();
|
|
return TRUE;
|
|
}
|
|
|
|
return value != 0;
|
|
}
|
|
|
|
/* Creates a new ScannerObject. */
|
|
static PyObject* pattern_scanner(PatternObject* pattern, PyObject* args,
|
|
PyObject* kwargs) {
|
|
/* Create search state object. */
|
|
ScannerObject* self;
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
int conc;
|
|
BOOL part;
|
|
|
|
PyObject* string;
|
|
PyObject* pos = Py_None;
|
|
PyObject* endpos = Py_None;
|
|
Py_ssize_t overlapped = FALSE;
|
|
PyObject* concurrent = Py_None;
|
|
PyObject* partial = Py_False;
|
|
static char* kwlist[] = { "string", "pos", "endpos", "overlapped",
|
|
"concurrent", "partial", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOnOO:scanner", kwlist,
|
|
&string, &pos, &endpos, &overlapped, &concurrent, &partial))
|
|
return NULL;
|
|
|
|
start = as_string_index(pos, 0);
|
|
if (start == -1 && PyErr_Occurred())
|
|
return NULL;
|
|
|
|
end = as_string_index(endpos, PY_SSIZE_T_MAX);
|
|
if (end == -1 && PyErr_Occurred())
|
|
return NULL;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
part = decode_partial(partial);
|
|
|
|
/* Create a scanner object. */
|
|
self = PyObject_NEW(ScannerObject, &Scanner_Type);
|
|
if (!self)
|
|
return NULL;
|
|
|
|
self->pattern = pattern;
|
|
Py_INCREF(self->pattern);
|
|
|
|
/* The MatchObject, and therefore repeated captures, will be visible. */
|
|
if (!state_init(&self->state, pattern, string, start, end, overlapped != 0,
|
|
conc, part, TRUE, TRUE, FALSE)) {
|
|
PyObject_DEL(self);
|
|
return NULL;
|
|
}
|
|
|
|
self->status = RE_ERROR_SUCCESS;
|
|
|
|
return (PyObject*) self;
|
|
}
|
|
|
|
/* Performs the split for the SplitterObject. */
|
|
Py_LOCAL_INLINE(PyObject*) next_split_part(SplitterObject* self) {
|
|
RE_State* state;
|
|
RE_SafeState safe_state;
|
|
PyObject* result = NULL; /* Initialise to stop compiler warning. */
|
|
|
|
state = &self->state;
|
|
|
|
/* Initialise the "safe state" structure. */
|
|
safe_state.re_state = state;
|
|
safe_state.thread_state = NULL;
|
|
|
|
/* Acquire the state lock in case we're sharing the splitter object across
|
|
* threads.
|
|
*/
|
|
acquire_state_lock((PyObject*)self, &safe_state);
|
|
|
|
if (self->status == RE_ERROR_FAILURE || self->status == RE_ERROR_PARTIAL) {
|
|
/* Finished. */
|
|
release_state_lock((PyObject*)self, &safe_state);
|
|
result = Py_False;
|
|
Py_INCREF(result);
|
|
return result;
|
|
} else if (self->status < 0) {
|
|
/* Internal error. */
|
|
release_state_lock((PyObject*)self, &safe_state);
|
|
set_error(self->status, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (self->index == 0) {
|
|
if (self->split_count < self->maxsplit) {
|
|
Py_ssize_t step;
|
|
Py_ssize_t end_pos;
|
|
|
|
if (state->reverse) {
|
|
step = -1;
|
|
end_pos = state->slice_start;
|
|
} else {
|
|
step = 1;
|
|
end_pos = state->slice_end;
|
|
}
|
|
|
|
retry:
|
|
self->status = do_match(&safe_state, TRUE);
|
|
if (self->status < 0)
|
|
goto error;
|
|
|
|
if (self->status == RE_ERROR_SUCCESS) {
|
|
if (state->version_0) {
|
|
/* Version 0 behaviour is to advance one character if the
|
|
* split was zero-width. Unfortunately, this can give an
|
|
* incorrect result. GvR wants this behaviour to be
|
|
* retained so as not to break any existing software which
|
|
* might rely on it.
|
|
*/
|
|
if (state->text_pos == state->match_pos) {
|
|
if (self->last_pos == end_pos)
|
|
goto no_match;
|
|
|
|
/* Advance one character. */
|
|
state->text_pos += step;
|
|
state->must_advance = FALSE;
|
|
goto retry;
|
|
}
|
|
}
|
|
|
|
++self->split_count;
|
|
|
|
/* Get segment before this match. */
|
|
if (state->reverse)
|
|
result = get_slice(state->string, state->match_pos,
|
|
self->last_pos);
|
|
else
|
|
result = get_slice(state->string, self->last_pos,
|
|
state->match_pos);
|
|
if (!result)
|
|
goto error;
|
|
|
|
self->last_pos = state->text_pos;
|
|
|
|
/* Version 0 behaviour is to advance one character if the match
|
|
* was zero-width. Unfortunately, this can give an incorrect
|
|
* result. GvR wants this behaviour to be retained so as not to
|
|
* break any existing software which might rely on it.
|
|
*/
|
|
if (state->version_0) {
|
|
if (state->text_pos == state->match_pos)
|
|
/* Advance one character. */
|
|
state->text_pos += step;
|
|
|
|
state->must_advance = FALSE;
|
|
} else
|
|
/* Continue from where we left off, but don't allow a
|
|
* contiguous zero-width match.
|
|
*/
|
|
state->must_advance = TRUE;
|
|
}
|
|
} else
|
|
goto no_match;
|
|
|
|
if (self->status == RE_ERROR_FAILURE || self->status ==
|
|
RE_ERROR_PARTIAL) {
|
|
no_match:
|
|
/* Get segment following last match (even if empty). */
|
|
if (state->reverse)
|
|
result = get_slice(state->string, 0, self->last_pos);
|
|
else
|
|
result = get_slice(state->string, self->last_pos,
|
|
state->text_length);
|
|
if (!result)
|
|
goto error;
|
|
}
|
|
} else {
|
|
/* Add group. */
|
|
result = state_get_group(state, self->index, state->string, FALSE);
|
|
if (!result)
|
|
goto error;
|
|
}
|
|
|
|
++self->index;
|
|
if ((size_t)self->index > state->pattern->public_group_count)
|
|
self->index = 0;
|
|
|
|
/* Release the state lock. */
|
|
release_state_lock((PyObject*)self, &safe_state);
|
|
|
|
return result;
|
|
|
|
error:
|
|
/* Release the state lock. */
|
|
release_state_lock((PyObject*)self, &safe_state);
|
|
|
|
return NULL;
|
|
}
|
|
|
|
/* SplitterObject's 'split' method. */
|
|
static PyObject* splitter_split(SplitterObject* self, PyObject *unused) {
|
|
PyObject* result;
|
|
|
|
result = next_split_part(self);
|
|
|
|
if (result == Py_False) {
|
|
/* The sentinel. */
|
|
Py_DECREF(Py_False);
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* SplitterObject's 'next' method. */
|
|
static PyObject* splitter_next(PyObject* self) {
|
|
PyObject* result;
|
|
|
|
result = next_split_part((SplitterObject*)self);
|
|
|
|
if (result == Py_False) {
|
|
/* No match. */
|
|
Py_DECREF(Py_False);
|
|
PyErr_SetNone(PyExc_StopIteration);
|
|
return NULL;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Returns an iterator for a SplitterObject.
|
|
*
|
|
* The iterator is actually the SplitterObject itself.
|
|
*/
|
|
static PyObject* splitter_iter(PyObject* self) {
|
|
Py_INCREF(self);
|
|
return self;
|
|
}
|
|
|
|
/* Gets the next result from a splitter iterator. */
|
|
static PyObject* splitter_iternext(PyObject* self) {
|
|
PyObject* result;
|
|
|
|
result = next_split_part((SplitterObject*)self);
|
|
|
|
if (result == Py_False) {
|
|
/* No match. */
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* Makes a copy of a SplitterObject.
|
|
*
|
|
* It actually doesn't make a copy, just returns the original object.
|
|
*/
|
|
Py_LOCAL_INLINE(PyObject*) make_splitter_copy(SplitterObject* self) {
|
|
Py_INCREF(self);
|
|
return (PyObject*)self;
|
|
}
|
|
|
|
/* SplitterObject's '__copy__' method. */
|
|
static PyObject* splitter_copy(SplitterObject* self, PyObject *unused) {
|
|
return make_splitter_copy(self);
|
|
}
|
|
|
|
/* SplitterObject's '__deepcopy__' method. */
|
|
static PyObject* splitter_deepcopy(SplitterObject* self, PyObject* memo) {
|
|
return make_splitter_copy(self);
|
|
}
|
|
|
|
/* The documentation of a SplitterObject. */
|
|
PyDoc_STRVAR(splitter_split_doc,
|
|
"split() --> string or None.\n\
|
|
Return the next part of the split string.");
|
|
|
|
/* SplitterObject's methods. */
|
|
static PyMethodDef splitter_methods[] = {
|
|
{"next", (PyCFunction)splitter_next, METH_NOARGS},
|
|
{"split", (PyCFunction)splitter_split, METH_NOARGS, splitter_split_doc},
|
|
{"__copy__", (PyCFunction)splitter_copy, METH_NOARGS},
|
|
{"__deepcopy__", (PyCFunction)splitter_deepcopy, METH_O},
|
|
{NULL, NULL}
|
|
};
|
|
|
|
PyDoc_STRVAR(splitter_doc, "Splitter object");
|
|
|
|
/* Deallocates a SplitterObject. */
|
|
static void splitter_dealloc(PyObject* self_) {
|
|
SplitterObject* self;
|
|
|
|
self = (SplitterObject*)self_;
|
|
|
|
state_fini(&self->state);
|
|
Py_DECREF(self->pattern);
|
|
PyObject_DEL(self);
|
|
}
|
|
|
|
static PyMemberDef splitter_members[] = {
|
|
{"pattern", T_OBJECT, offsetof(SplitterObject, pattern), READONLY,
|
|
"The regex object that produced this splitter object."},
|
|
{NULL} /* Sentinel */
|
|
};
|
|
|
|
static PyTypeObject Splitter_Type = {
|
|
PyObject_HEAD_INIT(NULL)
|
|
0,
|
|
"_" RE_MODULE "." "Splitter",
|
|
sizeof(SplitterObject)
|
|
};
|
|
|
|
/* Creates a new SplitterObject. */
|
|
Py_LOCAL_INLINE(PyObject*) pattern_splitter(PatternObject* pattern, PyObject*
|
|
args, PyObject* kwargs) {
|
|
/* Create split state object. */
|
|
int conc;
|
|
SplitterObject* self;
|
|
RE_State* state;
|
|
|
|
PyObject* string;
|
|
Py_ssize_t maxsplit = 0;
|
|
PyObject* concurrent = Py_None;
|
|
static char* kwlist[] = { "string", "maxsplit", "concurrent", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nO:splitter", kwlist,
|
|
&string, &maxsplit, &concurrent))
|
|
return NULL;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
/* Create a splitter object. */
|
|
self = PyObject_NEW(SplitterObject, &Splitter_Type);
|
|
if (!self)
|
|
return NULL;
|
|
|
|
self->pattern = pattern;
|
|
Py_INCREF(self->pattern);
|
|
|
|
if (maxsplit == 0)
|
|
maxsplit = PY_SSIZE_T_MAX;
|
|
|
|
state = &self->state;
|
|
|
|
/* The MatchObject, and therefore repeated captures, will not be visible.
|
|
*/
|
|
if (!state_init(state, pattern, string, 0, PY_SSIZE_T_MAX, FALSE, conc,
|
|
FALSE, TRUE, FALSE, FALSE)) {
|
|
PyObject_DEL(self);
|
|
return NULL;
|
|
}
|
|
|
|
self->maxsplit = maxsplit;
|
|
self->last_pos = state->reverse ? state->text_length : 0;
|
|
self->split_count = 0;
|
|
self->index = 0;
|
|
self->status = 1;
|
|
|
|
return (PyObject*) self;
|
|
}
|
|
|
|
/* Implements the functionality of PatternObject's search and match methods. */
|
|
Py_LOCAL_INLINE(PyObject*) pattern_search_or_match(PatternObject* self,
|
|
PyObject* args, PyObject* kwargs, char* args_desc, BOOL search, BOOL
|
|
match_all) {
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
int conc;
|
|
BOOL part;
|
|
RE_State state;
|
|
RE_SafeState safe_state;
|
|
int status;
|
|
PyObject* match;
|
|
|
|
PyObject* string;
|
|
PyObject* pos = Py_None;
|
|
PyObject* endpos = Py_None;
|
|
PyObject* concurrent = Py_None;
|
|
PyObject* partial = Py_False;
|
|
static char* kwlist[] = { "string", "pos", "endpos", "concurrent",
|
|
"partial", NULL };
|
|
/* When working with a short string, such as a line from a file, the
|
|
* relative cost of PyArg_ParseTupleAndKeywords can be significant, and
|
|
* it's worth not using it when there are only positional arguments.
|
|
*/
|
|
Py_ssize_t arg_count;
|
|
if (args && !kwargs && PyTuple_CheckExact(args))
|
|
arg_count = PyTuple_GET_SIZE(args);
|
|
else
|
|
arg_count = -1;
|
|
|
|
if (1 <= arg_count && arg_count <= 5) {
|
|
/* PyTuple_GET_ITEM borrows the reference. */
|
|
string = PyTuple_GET_ITEM(args, 0);
|
|
if (arg_count >= 2)
|
|
pos = PyTuple_GET_ITEM(args, 1);
|
|
if (arg_count >= 3)
|
|
endpos = PyTuple_GET_ITEM(args, 2);
|
|
if (arg_count >= 4)
|
|
concurrent = PyTuple_GET_ITEM(args, 3);
|
|
if (arg_count >= 5)
|
|
partial = PyTuple_GET_ITEM(args, 4);
|
|
} else if (!PyArg_ParseTupleAndKeywords(args, kwargs, args_desc, kwlist,
|
|
&string, &pos, &endpos, &concurrent, &partial))
|
|
return NULL;
|
|
|
|
start = as_string_index(pos, 0);
|
|
if (start == -1 && PyErr_Occurred())
|
|
return NULL;
|
|
|
|
end = as_string_index(endpos, PY_SSIZE_T_MAX);
|
|
if (end == -1 && PyErr_Occurred())
|
|
return NULL;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
part = decode_partial(partial);
|
|
|
|
/* The MatchObject, and therefore repeated captures, will be visible. */
|
|
if (!state_init(&state, self, string, start, end, FALSE, conc, part, FALSE,
|
|
TRUE, match_all))
|
|
return NULL;
|
|
|
|
/* Initialise the "safe state" structure. */
|
|
safe_state.re_state = &state;
|
|
safe_state.thread_state = NULL;
|
|
|
|
status = do_match(&safe_state, search);
|
|
|
|
if (status >= 0 || status == RE_ERROR_PARTIAL)
|
|
/* Create the match object. */
|
|
match = pattern_new_match(self, &state, status);
|
|
else
|
|
match = NULL;
|
|
|
|
state_fini(&state);
|
|
|
|
return match;
|
|
}
|
|
|
|
/* PatternObject's 'match' method. */
|
|
static PyObject* pattern_match(PatternObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
return pattern_search_or_match(self, args, kwargs, "O|OOOO:match", FALSE,
|
|
FALSE);
|
|
}
|
|
|
|
/* PatternObject's 'fullmatch' method. */
|
|
static PyObject* pattern_fullmatch(PatternObject* self, PyObject* args,
|
|
PyObject* kwargs) {
|
|
return pattern_search_or_match(self, args, kwargs, "O|OOOO:fullmatch",
|
|
FALSE, TRUE);
|
|
}
|
|
|
|
/* PatternObject's 'search' method. */
|
|
static PyObject* pattern_search(PatternObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
return pattern_search_or_match(self, args, kwargs, "O|OOOO:search", TRUE,
|
|
FALSE);
|
|
}
|
|
|
|
/* Gets the limits of the matching. */
|
|
Py_LOCAL_INLINE(BOOL) get_limits(PyObject* pos, PyObject* endpos, Py_ssize_t
|
|
length, Py_ssize_t* start, Py_ssize_t* end) {
|
|
Py_ssize_t s;
|
|
Py_ssize_t e;
|
|
|
|
s = as_string_index(pos, 0);
|
|
if (s == -1 && PyErr_Occurred())
|
|
return FALSE;
|
|
|
|
e = as_string_index(endpos, PY_SSIZE_T_MAX);
|
|
if (e == -1 && PyErr_Occurred())
|
|
return FALSE;
|
|
|
|
/* Adjust boundaries. */
|
|
if (s < 0)
|
|
s += length;
|
|
if (s < 0)
|
|
s = 0;
|
|
else if (s > length)
|
|
s = length;
|
|
|
|
if (e < 0)
|
|
e += length;
|
|
if (e < 0)
|
|
e = 0;
|
|
else if (e > length)
|
|
e = length;
|
|
|
|
*start = s;
|
|
*end = e;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Gets a replacement item from the replacement list.
|
|
*
|
|
* The replacement item could be a string literal or a group.
|
|
*
|
|
* It can return None to represent an empty string.
|
|
*/
|
|
Py_LOCAL_INLINE(PyObject*) get_sub_replacement(PyObject* item, PyObject*
|
|
string, RE_State* state, size_t group_count) {
|
|
Py_ssize_t index;
|
|
|
|
if (PyUnicode_CheckExact(item) || PyString_CheckExact(item)) {
|
|
/* It's a literal, which can be added directly to the list. */
|
|
Py_INCREF(item);
|
|
return item;
|
|
}
|
|
|
|
/* Is it a group reference? */
|
|
index = as_group_index(item);
|
|
if (index == -1 && PyErr_Occurred()) {
|
|
/* Not a group either! */
|
|
set_error(RE_ERROR_REPLACEMENT, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
if (index == 0) {
|
|
/* The entire matched portion of the string. */
|
|
if (state->match_pos == state->text_pos) {
|
|
/* Return None for "". */
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
|
|
if (state->reverse)
|
|
return get_slice(string, state->text_pos, state->match_pos);
|
|
else
|
|
return get_slice(string, state->match_pos, state->text_pos);
|
|
} else if (1 <= index && (size_t)index <= group_count) {
|
|
/* A group. */
|
|
RE_GroupData* group;
|
|
|
|
group = &state->groups[index - 1];
|
|
|
|
if (group->capture_count == 0 && group->span.start != group->span.end)
|
|
{
|
|
/* The group didn't match or is "", so return None for "". */
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
}
|
|
|
|
return get_slice(string, group->span.start, group->span.end);
|
|
} else {
|
|
/* No such group. */
|
|
set_error(RE_ERROR_INVALID_GROUP_REF, NULL);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/* PatternObject's 'subx' method. */
|
|
Py_LOCAL_INLINE(PyObject*) pattern_subx(PatternObject* self, PyObject*
|
|
str_template, PyObject* string, Py_ssize_t maxsub, int sub_type, PyObject*
|
|
pos, PyObject* endpos, int concurrent) {
|
|
RE_StringInfo str_info;
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
BOOL is_callable = FALSE;
|
|
BOOL is_literal = FALSE;
|
|
BOOL is_template = FALSE;
|
|
PyObject* replacement = NULL;
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
BOOL is_format = FALSE;
|
|
#endif
|
|
RE_State state;
|
|
RE_SafeState safe_state;
|
|
JoinInfo join_info;
|
|
Py_ssize_t sub_count;
|
|
Py_ssize_t last_pos;
|
|
PyObject* item;
|
|
Py_ssize_t end_pos;
|
|
Py_ssize_t step;
|
|
|
|
/* Get the string. */
|
|
if (!get_string(string, &str_info))
|
|
return NULL;
|
|
|
|
/* Get the limits of the search. */
|
|
if (!get_limits(pos, endpos, str_info.length, &start, &end)) {
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return NULL;
|
|
}
|
|
|
|
/* If the pattern is too long for the string, then take a shortcut, unless
|
|
* it's a fuzzy pattern.
|
|
*/
|
|
if (!self->is_fuzzy && self->min_width > end - start) {
|
|
PyObject* result;
|
|
|
|
Py_INCREF(string);
|
|
|
|
if (sub_type & RE_SUBN)
|
|
result = Py_BuildValue("Nn", string, 0);
|
|
else
|
|
result = string;
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return result;
|
|
}
|
|
|
|
if (maxsub == 0)
|
|
maxsub = PY_SSIZE_T_MAX;
|
|
|
|
/* sub/subn takes either a function or a string template. */
|
|
if (PyCallable_Check(str_template)) {
|
|
/* It's callable. */
|
|
is_callable = TRUE;
|
|
|
|
replacement = str_template;
|
|
Py_INCREF(replacement);
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
} else if (sub_type & RE_SUBF) {
|
|
/* Is it a literal format?
|
|
*
|
|
* To keep it simple we'll say that a literal is a string which can be
|
|
* used as-is, so no placeholders.
|
|
*/
|
|
Py_ssize_t literal_length;
|
|
|
|
literal_length = check_replacement_string(str_template, '{');
|
|
if (literal_length > 0) {
|
|
/* It's a literal. */
|
|
is_literal = TRUE;
|
|
|
|
replacement = str_template;
|
|
Py_INCREF(replacement);
|
|
} else if (literal_length < 0) {
|
|
/* It isn't a literal, so get the 'format' method. */
|
|
is_format = TRUE;
|
|
|
|
replacement = PyObject_GetAttrString(str_template, "format");
|
|
if (!replacement) {
|
|
release_buffer(&str_info);
|
|
return NULL;
|
|
}
|
|
}
|
|
#endif
|
|
} else {
|
|
/* Is it a literal template?
|
|
*
|
|
* To keep it simple we'll say that a literal is a string which can be
|
|
* used as-is, so no backslashes.
|
|
*/
|
|
Py_ssize_t literal_length;
|
|
|
|
literal_length = check_replacement_string(str_template, '\\');
|
|
if (literal_length > 0) {
|
|
/* It's a literal. */
|
|
is_literal = TRUE;
|
|
|
|
replacement = str_template;
|
|
Py_INCREF(replacement);
|
|
} else if (literal_length < 0 ) {
|
|
/* It isn't a literal, so hand it over to the template compiler. */
|
|
is_template = TRUE;
|
|
|
|
replacement = call(RE_MODULE, "_compile_replacement_helper",
|
|
PyTuple_Pack(2, self, str_template));
|
|
if (!replacement) {
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* The MatchObject, and therefore repeated captures, will be visible only
|
|
* if the replacement is callable.
|
|
*/
|
|
if (!state_init_2(&state, self, string, &str_info, start, end, FALSE,
|
|
concurrent, FALSE, FALSE, is_callable, FALSE)) {
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
Py_XDECREF(replacement);
|
|
return NULL;
|
|
}
|
|
|
|
/* Initialise the "safe state" structure. */
|
|
safe_state.re_state = &state;
|
|
safe_state.thread_state = NULL;
|
|
|
|
init_join_list(&join_info, state.reverse, PyUnicode_Check(string));
|
|
|
|
sub_count = 0;
|
|
last_pos = state.reverse ? state.text_length : 0;
|
|
step = state.reverse ? -1 : 1;
|
|
while (sub_count < maxsub) {
|
|
int status;
|
|
|
|
status = do_match(&safe_state, TRUE);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
if (status == 0)
|
|
break;
|
|
|
|
/* Append the segment before this match. */
|
|
if (state.match_pos != last_pos) {
|
|
if (state.reverse)
|
|
item = get_slice(string, state.match_pos, last_pos);
|
|
else
|
|
item = get_slice(string, last_pos, state.match_pos);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* Add to the list. */
|
|
status = add_to_join_list(&join_info, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
|
|
/* Add this match. */
|
|
if (is_literal) {
|
|
/* The replacement is a literal string. */
|
|
status = add_to_join_list(&join_info, replacement);
|
|
if (status < 0)
|
|
goto error;
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
} else if (is_format) {
|
|
/* The replacement is a format string. */
|
|
MatchObject* match;
|
|
PyObject* args;
|
|
size_t g;
|
|
PyObject* kwargs;
|
|
|
|
/* We need to create the arguments for the 'format' method. We'll
|
|
* start by creating a MatchObject.
|
|
*/
|
|
match = (MatchObject*)pattern_new_match(self, &state, 1);
|
|
if (!match)
|
|
goto error;
|
|
|
|
/* The args are a tuple of the capture group matches. */
|
|
args = PyTuple_New((Py_ssize_t)state.pattern->public_group_count +
|
|
1);
|
|
if (!args) {
|
|
Py_DECREF(match);
|
|
goto error;
|
|
}
|
|
|
|
for (g = 0; g < state.pattern->public_group_count + 1; g++)
|
|
/* PyTuple_SetItem borrows the reference. */
|
|
PyTuple_SetItem(args, (Py_ssize_t)g,
|
|
match_get_group_by_index(match, (Py_ssize_t)g, Py_None));
|
|
|
|
/* The kwargs are a dict of the named capture group matches. */
|
|
kwargs = match_get_group_dict(match);
|
|
if (!kwargs) {
|
|
Py_DECREF(args);
|
|
Py_DECREF(match);
|
|
goto error;
|
|
}
|
|
|
|
/* Call the 'format' method. */
|
|
item = PyObject_Call(replacement, args, kwargs);
|
|
Py_DECREF(kwargs);
|
|
Py_DECREF(args);
|
|
Py_DECREF(match);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* Add the result to the list. */
|
|
status = add_to_join_list(&join_info, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
#endif
|
|
} else if (is_template) {
|
|
/* The replacement is a list template. */
|
|
Py_ssize_t size;
|
|
Py_ssize_t i;
|
|
|
|
/* Add each part of the template to the list. */
|
|
size = PyList_GET_SIZE(replacement);
|
|
for (i = 0; i < size; i++) {
|
|
PyObject* item;
|
|
PyObject* str_item;
|
|
|
|
/* PyList_GET_ITEM borrows a reference. */
|
|
item = PyList_GET_ITEM(replacement, i);
|
|
str_item = get_sub_replacement(item, string, &state,
|
|
self->public_group_count);
|
|
if (!str_item)
|
|
goto error;
|
|
|
|
/* Add the result to the list. */
|
|
if (str_item == Py_None)
|
|
/* None for "". */
|
|
Py_DECREF(str_item);
|
|
else {
|
|
status = add_to_join_list(&join_info, str_item);
|
|
Py_DECREF(str_item);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
}
|
|
} else if (is_callable) {
|
|
/* Pass a MatchObject to the replacement function. */
|
|
PyObject* match;
|
|
PyObject* args;
|
|
|
|
/* We need to create a MatchObject to pass to the replacement
|
|
* function.
|
|
*/
|
|
match = pattern_new_match(self, &state, 1);
|
|
if (!match)
|
|
goto error;
|
|
|
|
/* The args for the replacement function. */
|
|
args = PyTuple_Pack(1, match);
|
|
if (!args) {
|
|
Py_DECREF(match);
|
|
goto error;
|
|
}
|
|
|
|
/* Call the replacement function. */
|
|
item = PyObject_CallObject(replacement, args);
|
|
Py_DECREF(args);
|
|
Py_DECREF(match);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* Add the result to the list. */
|
|
status = add_to_join_list(&join_info, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
|
|
++sub_count;
|
|
|
|
last_pos = state.text_pos;
|
|
|
|
if (state.version_0) {
|
|
/* Always advance after a zero-width match. */
|
|
if (state.match_pos == state.text_pos) {
|
|
state.text_pos += step;
|
|
state.must_advance = FALSE;
|
|
} else
|
|
state.must_advance = TRUE;
|
|
} else
|
|
/* Continue from where we left off, but don't allow a contiguous
|
|
* zero-width match.
|
|
*/
|
|
state.must_advance = state.match_pos == state.text_pos;
|
|
}
|
|
|
|
/* Get the segment following the last match. We use 'length' instead of
|
|
* 'text_length' because the latter is truncated to 'slice_end', a
|
|
* documented idiosyncracy of the 're' module.
|
|
*/
|
|
end_pos = state.reverse ? 0 : str_info.length;
|
|
if (last_pos != end_pos) {
|
|
int status;
|
|
|
|
/* The segment is part of the original string. */
|
|
if (state.reverse)
|
|
item = get_slice(string, 0, last_pos);
|
|
else
|
|
item = get_slice(string, last_pos, str_info.length);
|
|
if (!item)
|
|
goto error;
|
|
|
|
status = add_to_join_list(&join_info, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
|
|
Py_XDECREF(replacement);
|
|
|
|
/* Convert the list to a single string (also cleans up join_info). */
|
|
item = join_list_info(&join_info);
|
|
|
|
state_fini(&state);
|
|
|
|
if (!item)
|
|
return NULL;
|
|
|
|
if (sub_type & RE_SUBN)
|
|
return Py_BuildValue("Nn", item, sub_count);
|
|
|
|
return item;
|
|
|
|
error:
|
|
clear_join_list(&join_info);
|
|
state_fini(&state);
|
|
Py_XDECREF(replacement);
|
|
return NULL;
|
|
}
|
|
|
|
/* PatternObject's 'sub' method. */
|
|
static PyObject* pattern_sub(PatternObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
int conc;
|
|
|
|
PyObject* replacement;
|
|
PyObject* string;
|
|
Py_ssize_t count = 0;
|
|
PyObject* pos = Py_None;
|
|
PyObject* endpos = Py_None;
|
|
PyObject* concurrent = Py_None;
|
|
static char* kwlist[] = { "repl", "string", "count", "pos", "endpos",
|
|
"concurrent", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nOOO:sub", kwlist,
|
|
&replacement, &string, &count, &pos, &endpos, &concurrent))
|
|
return NULL;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
return pattern_subx(self, replacement, string, count, RE_SUB, pos, endpos,
|
|
conc);
|
|
}
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
/* PatternObject's 'subf' method. */
|
|
static PyObject* pattern_subf(PatternObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
int conc;
|
|
|
|
PyObject* format;
|
|
PyObject* string;
|
|
Py_ssize_t count = 0;
|
|
PyObject* pos = Py_None;
|
|
PyObject* endpos = Py_None;
|
|
PyObject* concurrent = Py_None;
|
|
static char* kwlist[] = { "format", "string", "count", "pos", "endpos",
|
|
"concurrent", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nOOO:sub", kwlist,
|
|
&format, &string, &count, &pos, &endpos, &concurrent))
|
|
return NULL;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
return pattern_subx(self, format, string, count, RE_SUBF, pos, endpos,
|
|
conc);
|
|
}
|
|
|
|
#endif
|
|
/* PatternObject's 'subn' method. */
|
|
static PyObject* pattern_subn(PatternObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
int conc;
|
|
|
|
PyObject* replacement;
|
|
PyObject* string;
|
|
Py_ssize_t count = 0;
|
|
PyObject* pos = Py_None;
|
|
PyObject* endpos = Py_None;
|
|
PyObject* concurrent = Py_None;
|
|
static char* kwlist[] = { "repl", "string", "count", "pos", "endpos",
|
|
"concurrent", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nOOO:subn", kwlist,
|
|
&replacement, &string, &count, &pos, &endpos, &concurrent))
|
|
return NULL;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
return pattern_subx(self, replacement, string, count, RE_SUBN, pos, endpos,
|
|
conc);
|
|
}
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
/* PatternObject's 'subfn' method. */
|
|
static PyObject* pattern_subfn(PatternObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
int conc;
|
|
|
|
PyObject* format;
|
|
PyObject* string;
|
|
Py_ssize_t count = 0;
|
|
PyObject* pos = Py_None;
|
|
PyObject* endpos = Py_None;
|
|
PyObject* concurrent = Py_None;
|
|
static char* kwlist[] = { "format", "string", "count", "pos", "endpos",
|
|
"concurrent", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|nOOO:subn", kwlist,
|
|
&format, &string, &count, &pos, &endpos, &concurrent))
|
|
return NULL;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
return pattern_subx(self, format, string, count, RE_SUBF | RE_SUBN, pos,
|
|
endpos, conc);
|
|
}
|
|
|
|
#endif
|
|
/* PatternObject's 'split' method. */
|
|
static PyObject* pattern_split(PatternObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
int conc;
|
|
|
|
RE_State state;
|
|
RE_SafeState safe_state;
|
|
PyObject* list;
|
|
PyObject* item;
|
|
int status;
|
|
Py_ssize_t split_count;
|
|
size_t g;
|
|
Py_ssize_t start_pos;
|
|
Py_ssize_t end_pos;
|
|
Py_ssize_t step;
|
|
Py_ssize_t last_pos;
|
|
|
|
PyObject* string;
|
|
Py_ssize_t maxsplit = 0;
|
|
PyObject* concurrent = Py_None;
|
|
static char* kwlist[] = { "string", "maxsplit", "concurrent", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|nO:split", kwlist,
|
|
&string, &maxsplit, &concurrent))
|
|
return NULL;
|
|
|
|
if (maxsplit == 0)
|
|
maxsplit = PY_SSIZE_T_MAX;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
/* The MatchObject, and therefore repeated captures, will not be visible.
|
|
*/
|
|
if (!state_init(&state, self, string, 0, PY_SSIZE_T_MAX, FALSE, conc,
|
|
FALSE, FALSE, FALSE, FALSE))
|
|
return NULL;
|
|
|
|
/* Initialise the "safe state" structure. */
|
|
safe_state.re_state = &state;
|
|
safe_state.thread_state = NULL;
|
|
|
|
list = PyList_New(0);
|
|
if (!list) {
|
|
state_fini(&state);
|
|
return NULL;
|
|
}
|
|
|
|
split_count = 0;
|
|
if (state.reverse) {
|
|
start_pos = state.text_length;
|
|
end_pos = 0;
|
|
step = -1;
|
|
} else {
|
|
start_pos = 0;
|
|
end_pos = state.text_length;
|
|
step = 1;
|
|
}
|
|
|
|
last_pos = start_pos;
|
|
while (split_count < maxsplit) {
|
|
status = do_match(&safe_state, TRUE);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
if (status == 0)
|
|
/* No more matches. */
|
|
break;
|
|
|
|
if (state.version_0) {
|
|
/* Version 0 behaviour is to advance one character if the split was
|
|
* zero-width. Unfortunately, this can give an incorrect result.
|
|
* GvR wants this behaviour to be retained so as not to break any
|
|
* existing software which might rely on it.
|
|
*/
|
|
if (state.text_pos == state.match_pos) {
|
|
if (last_pos == end_pos)
|
|
break;
|
|
|
|
/* Advance one character. */
|
|
state.text_pos += step;
|
|
state.must_advance = FALSE;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* Get segment before this match. */
|
|
if (state.reverse)
|
|
item = get_slice(string, state.match_pos, last_pos);
|
|
else
|
|
item = get_slice(string, last_pos, state.match_pos);
|
|
if (!item)
|
|
goto error;
|
|
|
|
status = PyList_Append(list, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
/* Add groups (if any). */
|
|
for (g = 1; g <= self->public_group_count; g++) {
|
|
item = state_get_group(&state, (Py_ssize_t)g, string, FALSE);
|
|
if (!item)
|
|
goto error;
|
|
|
|
status = PyList_Append(list, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
|
|
++split_count;
|
|
last_pos = state.text_pos;
|
|
|
|
/* Version 0 behaviour is to advance one character if the match was
|
|
* zero-width. Unfortunately, this can give an incorrect result. GvR
|
|
* wants this behaviour to be retained so as not to break any existing
|
|
* software which might rely on it.
|
|
*/
|
|
if (state.version_0) {
|
|
if (state.text_pos == state.match_pos)
|
|
/* Advance one character. */
|
|
state.text_pos += step;
|
|
|
|
state.must_advance = FALSE;
|
|
} else
|
|
/* Continue from where we left off, but don't allow a contiguous
|
|
* zero-width match.
|
|
*/
|
|
state.must_advance = TRUE;
|
|
}
|
|
|
|
/* Get segment following last match (even if empty). */
|
|
if (state.reverse)
|
|
item = get_slice(string, 0, last_pos);
|
|
else
|
|
item = get_slice(string, last_pos, state.text_length);
|
|
if (!item)
|
|
goto error;
|
|
|
|
status = PyList_Append(list, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
state_fini(&state);
|
|
|
|
return list;
|
|
|
|
error:
|
|
Py_DECREF(list);
|
|
state_fini(&state);
|
|
return NULL;
|
|
}
|
|
|
|
/* PatternObject's 'splititer' method. */
|
|
static PyObject* pattern_splititer(PatternObject* pattern, PyObject* args,
|
|
PyObject* kwargs) {
|
|
return pattern_splitter(pattern, args, kwargs);
|
|
}
|
|
|
|
/* PatternObject's 'findall' method. */
|
|
static PyObject* pattern_findall(PatternObject* self, PyObject* args, PyObject*
|
|
kwargs) {
|
|
Py_ssize_t start;
|
|
Py_ssize_t end;
|
|
RE_State state;
|
|
int conc;
|
|
RE_SafeState safe_state;
|
|
PyObject* list;
|
|
Py_ssize_t step;
|
|
int status;
|
|
size_t g;
|
|
Py_ssize_t b;
|
|
Py_ssize_t e;
|
|
|
|
PyObject* string;
|
|
PyObject* pos = Py_None;
|
|
PyObject* endpos = Py_None;
|
|
Py_ssize_t overlapped = FALSE;
|
|
PyObject* concurrent = Py_None;
|
|
static char* kwlist[] = { "string", "pos", "endpos", "overlapped",
|
|
"concurrent", NULL };
|
|
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOnO:findall", kwlist,
|
|
&string, &pos, &endpos, &overlapped, &concurrent))
|
|
return NULL;
|
|
|
|
start = as_string_index(pos, 0);
|
|
if (start == -1 && PyErr_Occurred())
|
|
return NULL;
|
|
|
|
end = as_string_index(endpos, PY_SSIZE_T_MAX);
|
|
if (end == -1 && PyErr_Occurred())
|
|
return NULL;
|
|
|
|
conc = decode_concurrent(concurrent);
|
|
if (conc < 0)
|
|
return NULL;
|
|
|
|
/* The MatchObject, and therefore repeated captures, will not be visible.
|
|
*/
|
|
if (!state_init(&state, self, string, start, end, overlapped != 0, conc,
|
|
FALSE, FALSE, FALSE, FALSE))
|
|
return NULL;
|
|
|
|
/* Initialise the "safe state" structure. */
|
|
safe_state.re_state = &state;
|
|
safe_state.thread_state = NULL;
|
|
|
|
list = PyList_New(0);
|
|
if (!list) {
|
|
state_fini(&state);
|
|
return NULL;
|
|
}
|
|
|
|
step = state.reverse ? -1 : 1;
|
|
while (state.slice_start <= state.text_pos && state.text_pos <=
|
|
state.slice_end) {
|
|
PyObject* item;
|
|
|
|
status = do_match(&safe_state, TRUE);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
if (status == 0)
|
|
break;
|
|
|
|
/* Don't bother to build a MatchObject. */
|
|
switch (self->public_group_count) {
|
|
case 0:
|
|
if (state.reverse) {
|
|
b = state.text_pos;
|
|
e = state.match_pos;
|
|
} else {
|
|
b = state.match_pos;
|
|
e = state.text_pos;
|
|
}
|
|
item = get_slice(string, b, e);
|
|
if (!item)
|
|
goto error;
|
|
break;
|
|
case 1:
|
|
item = state_get_group(&state, 1, string, TRUE);
|
|
if (!item)
|
|
goto error;
|
|
break;
|
|
default:
|
|
item = PyTuple_New((Py_ssize_t)self->public_group_count);
|
|
if (!item)
|
|
goto error;
|
|
|
|
for (g = 0; g < self->public_group_count; g++) {
|
|
PyObject* o;
|
|
|
|
o = state_get_group(&state, (Py_ssize_t)g + 1, string, TRUE);
|
|
if (!o) {
|
|
Py_DECREF(item);
|
|
goto error;
|
|
}
|
|
|
|
/* PyTuple_SET_ITEM borrows the reference. */
|
|
PyTuple_SET_ITEM(item, g, o);
|
|
}
|
|
break;
|
|
}
|
|
|
|
status = PyList_Append(list, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
if (state.overlapped) {
|
|
/* Advance one character. */
|
|
state.text_pos = state.match_pos + step;
|
|
state.must_advance = FALSE;
|
|
} else
|
|
/* Continue from where we left off, but don't allow 2 contiguous
|
|
* zero-width matches.
|
|
*/
|
|
state.must_advance = state.text_pos == state.match_pos;
|
|
}
|
|
|
|
state_fini(&state);
|
|
|
|
return list;
|
|
|
|
error:
|
|
Py_DECREF(list);
|
|
state_fini(&state);
|
|
return NULL;
|
|
}
|
|
|
|
/* PatternObject's 'finditer' method. */
|
|
static PyObject* pattern_finditer(PatternObject* pattern, PyObject* args,
|
|
PyObject* kwargs) {
|
|
return pattern_scanner(pattern, args, kwargs);
|
|
}
|
|
|
|
/* Makes a copy of a PatternObject. */
|
|
Py_LOCAL_INLINE(PyObject*) make_pattern_copy(PatternObject* self) {
|
|
Py_INCREF(self);
|
|
return (PyObject*)self;
|
|
}
|
|
|
|
/* PatternObject's '__copy__' method. */
|
|
static PyObject* pattern_copy(PatternObject* self, PyObject *unused) {
|
|
return make_pattern_copy(self);
|
|
}
|
|
|
|
/* PatternObject's '__deepcopy__' method. */
|
|
static PyObject* pattern_deepcopy(PatternObject* self, PyObject* memo) {
|
|
return make_pattern_copy(self);
|
|
}
|
|
|
|
/* The documentation of a PatternObject. */
|
|
PyDoc_STRVAR(pattern_match_doc,
|
|
"match(string, pos=None, endpos=None, concurrent=None) --> MatchObject or None.\n\
|
|
Match zero or more characters at the beginning of the string.");
|
|
|
|
PyDoc_STRVAR(pattern_fullmatch_doc,
|
|
"fullmatch(string, pos=None, endpos=None, concurrent=None) --> MatchObject or None.\n\
|
|
Match zero or more characters against all of the string.");
|
|
|
|
PyDoc_STRVAR(pattern_search_doc,
|
|
"search(string, pos=None, endpos=None, concurrent=None) --> MatchObject or None.\n\
|
|
Search through string looking for a match, and return a corresponding\n\
|
|
match object instance. Return None if no match is found.");
|
|
|
|
PyDoc_STRVAR(pattern_sub_doc,
|
|
"sub(repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None) --> newstring\n\
|
|
Return the string obtained by replacing the leftmost (or rightmost with a\n\
|
|
reverse pattern) non-overlapping occurrences of pattern in string by the\n\
|
|
replacement repl.");
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
PyDoc_STRVAR(pattern_subf_doc,
|
|
"subf(format, string, count=0, flags=0, pos=None, endpos=None, concurrent=None) --> newstring\n\
|
|
Return the string obtained by replacing the leftmost (or rightmost with a\n\
|
|
reverse pattern) non-overlapping occurrences of pattern in string by the\n\
|
|
replacement format.");
|
|
|
|
#endif
|
|
PyDoc_STRVAR(pattern_subn_doc,
|
|
"subn(repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None) --> (newstring, number of subs)\n\
|
|
Return the tuple (new_string, number_of_subs_made) found by replacing the\n\
|
|
leftmost (or rightmost with a reverse pattern) non-overlapping occurrences\n\
|
|
of pattern with the replacement repl.");
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
PyDoc_STRVAR(pattern_subfn_doc,
|
|
"subfn(format, string, count=0, flags=0, pos=None, endpos=None, concurrent=None) --> (newstring, number of subs)\n\
|
|
Return the tuple (new_string, number_of_subs_made) found by replacing the\n\
|
|
leftmost (or rightmost with a reverse pattern) non-overlapping occurrences\n\
|
|
of pattern with the replacement format.");
|
|
|
|
#endif
|
|
PyDoc_STRVAR(pattern_split_doc,
|
|
"split(string, string, maxsplit=0, concurrent=None) --> list.\n\
|
|
Split string by the occurrences of pattern.");
|
|
|
|
PyDoc_STRVAR(pattern_splititer_doc,
|
|
"splititer(string, maxsplit=0, concurrent=None) --> iterator.\n\
|
|
Return an iterator yielding the parts of a split string.");
|
|
|
|
PyDoc_STRVAR(pattern_findall_doc,
|
|
"findall(string, pos=None, endpos=None, overlapped=False, concurrent=None) --> list.\n\
|
|
Return a list of all matches of pattern in string. The matches may be\n\
|
|
overlapped if overlapped is True.");
|
|
|
|
PyDoc_STRVAR(pattern_finditer_doc,
|
|
"finditer(string, pos=None, endpos=None, overlapped=False, concurrent=None) --> iterator.\n\
|
|
Return an iterator over all matches for the RE pattern in string. The\n\
|
|
matches may be overlapped if overlapped is True. For each match, the\n\
|
|
iterator returns a MatchObject.");
|
|
|
|
PyDoc_STRVAR(pattern_scanner_doc,
|
|
"scanner(string, pos=None, endpos=None, overlapped=False, concurrent=None) --> scanner.\n\
|
|
Return an scanner for the RE pattern in string. The matches may be overlapped\n\
|
|
if overlapped is True.");
|
|
|
|
/* The methods of a PatternObject. */
|
|
static PyMethodDef pattern_methods[] = {
|
|
{"match", (PyCFunction)pattern_match, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_match_doc},
|
|
{"fullmatch", (PyCFunction)pattern_fullmatch, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_fullmatch_doc},
|
|
{"search", (PyCFunction)pattern_search, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_search_doc},
|
|
{"sub", (PyCFunction)pattern_sub, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_sub_doc},
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
{"subf", (PyCFunction)pattern_subf, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_subf_doc},
|
|
#endif
|
|
{"subn", (PyCFunction)pattern_subn, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_subn_doc},
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
{"subfn", (PyCFunction)pattern_subfn, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_subfn_doc},
|
|
#endif
|
|
{"split", (PyCFunction)pattern_split, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_split_doc},
|
|
{"splititer", (PyCFunction)pattern_splititer, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_splititer_doc},
|
|
{"findall", (PyCFunction)pattern_findall, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_findall_doc},
|
|
{"finditer", (PyCFunction)pattern_finditer, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_finditer_doc},
|
|
{"scanner", (PyCFunction)pattern_scanner, METH_VARARGS|METH_KEYWORDS,
|
|
pattern_scanner_doc},
|
|
{"__copy__", (PyCFunction)pattern_copy, METH_NOARGS},
|
|
{"__deepcopy__", (PyCFunction)pattern_deepcopy, METH_O},
|
|
{NULL, NULL}
|
|
};
|
|
|
|
/* Docstring for the pattern type itself. */
PyDoc_STRVAR(pattern_doc,
    "Compiled regex object");
|
|
|
|
/* Deallocates a PatternObject. */
|
|
static void pattern_dealloc(PyObject* self_) {
|
|
PatternObject* self;
|
|
int partial_side;
|
|
size_t i;
|
|
|
|
self = (PatternObject*)self_;
|
|
|
|
/* Discard the nodes. */
|
|
for (i = 0; i < self->node_count; i++) {
|
|
RE_Node* node;
|
|
|
|
node = self->node_list[i];
|
|
re_dealloc(node->values);
|
|
if (node->status & RE_STATUS_STRING) {
|
|
re_dealloc(node->string.bad_character_offset);
|
|
re_dealloc(node->string.good_suffix_offset);
|
|
}
|
|
re_dealloc(node);
|
|
}
|
|
re_dealloc(self->node_list);
|
|
|
|
/* Discard the group info. */
|
|
re_dealloc(self->group_info);
|
|
|
|
/* Discard the call_ref info. */
|
|
re_dealloc(self->call_ref_info);
|
|
|
|
/* Discard the repeat info. */
|
|
re_dealloc(self->repeat_info);
|
|
|
|
dealloc_groups(self->groups_storage, self->true_group_count);
|
|
|
|
dealloc_repeats(self->repeats_storage, self->repeat_count);
|
|
|
|
if (self->weakreflist)
|
|
PyObject_ClearWeakRefs((PyObject*)self);
|
|
Py_XDECREF(self->pattern);
|
|
Py_XDECREF(self->groupindex);
|
|
Py_XDECREF(self->indexgroup);
|
|
|
|
for (partial_side = 0; partial_side < 2; partial_side++) {
|
|
if (self->partial_named_lists[partial_side]) {
|
|
for (i = 0; i < self->named_lists_count; i++)
|
|
Py_XDECREF(self->partial_named_lists[partial_side][i]);
|
|
|
|
re_dealloc(self->partial_named_lists[partial_side]);
|
|
}
|
|
}
|
|
|
|
Py_DECREF(self->named_lists);
|
|
Py_DECREF(self->named_list_indexes);
|
|
PyObject_DEL(self);
|
|
}
|
|
|
|
/* Associates the name of a flag with its value; used when building the repr
 * of a pattern.
 */
typedef struct RE_FlagName {
    char* name; /* The name shown after "regex." in the repr. */
    int value;  /* The flag value tested against the pattern's flags. */
} RE_FlagName;
|
|
|
|
/* We won't bother about the A flag in Python 2. */
|
|
static RE_FlagName flag_names[] = {
|
|
{"B", RE_FLAG_BESTMATCH},
|
|
{"D", RE_FLAG_DEBUG},
|
|
{"S", RE_FLAG_DOTALL},
|
|
{"F", RE_FLAG_FULLCASE},
|
|
{"I", RE_FLAG_IGNORECASE},
|
|
{"L", RE_FLAG_LOCALE},
|
|
{"M", RE_FLAG_MULTILINE},
|
|
{"R", RE_FLAG_REVERSE},
|
|
{"T", RE_FLAG_TEMPLATE},
|
|
{"U", RE_FLAG_UNICODE},
|
|
{"X", RE_FLAG_VERBOSE},
|
|
{"V0", RE_FLAG_VERSION0},
|
|
{"V1", RE_FLAG_VERSION1},
|
|
{"W", RE_FLAG_WORD},
|
|
};
|
|
|
|
/* Appends a string to a list. */
|
|
Py_LOCAL_INLINE(BOOL) append_string(PyObject* list, char* string) {
|
|
PyObject* item;
|
|
int status;
|
|
|
|
item = Py_BuildValue("s", string);
|
|
if (!item)
|
|
return FALSE;
|
|
|
|
status = PyList_Append(list, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Appends a (decimal) integer to a list. */
|
|
Py_LOCAL_INLINE(BOOL) append_integer(PyObject* list, Py_ssize_t value) {
|
|
PyObject* int_obj;
|
|
PyObject* repr_obj;
|
|
int status;
|
|
|
|
int_obj = Py_BuildValue("n", value);
|
|
if (!int_obj)
|
|
return FALSE;
|
|
|
|
repr_obj = PyObject_Repr(int_obj);
|
|
Py_DECREF(int_obj);
|
|
if (!repr_obj)
|
|
return FALSE;
|
|
|
|
status = PyList_Append(list, repr_obj);
|
|
Py_DECREF(repr_obj);
|
|
if (status < 0)
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* MatchObject's '__repr__' method. */
|
|
static PyObject* match_repr(PyObject* self_) {
|
|
MatchObject* self;
|
|
PyObject* list;
|
|
PyObject* matched_substring;
|
|
PyObject* matched_repr;
|
|
int status;
|
|
PyObject* separator;
|
|
PyObject* result;
|
|
|
|
self = (MatchObject*)self_;
|
|
|
|
list = PyList_New(0);
|
|
if (!list)
|
|
return NULL;
|
|
|
|
if (!append_string(list, "<regex.Match object; span=("))
|
|
goto error;
|
|
|
|
if (!append_integer(list, self->match_start))
|
|
goto error;
|
|
|
|
if (! append_string(list, ", "))
|
|
goto error;
|
|
|
|
if (!append_integer(list, self->match_end))
|
|
goto error;
|
|
|
|
if (!append_string(list, "), match="))
|
|
goto error;
|
|
|
|
matched_substring = get_slice(self->substring, self->match_start -
|
|
self->substring_offset, self->match_end - self->substring_offset);
|
|
if (!matched_substring)
|
|
goto error;
|
|
|
|
matched_repr = PyObject_Repr(matched_substring);
|
|
Py_DECREF(matched_substring);
|
|
if (!matched_repr)
|
|
goto error;
|
|
|
|
status = PyList_Append(list, matched_repr);
|
|
Py_DECREF(matched_repr);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
if (self->fuzzy_counts[RE_FUZZY_SUB] != 0 ||
|
|
self->fuzzy_counts[RE_FUZZY_INS] != 0 || self->fuzzy_counts[RE_FUZZY_DEL]
|
|
!= 0) {
|
|
if (! append_string(list, ", fuzzy_counts=("))
|
|
goto error;
|
|
|
|
if (!append_integer(list,
|
|
(Py_ssize_t)self->fuzzy_counts[RE_FUZZY_SUB]))
|
|
goto error;
|
|
|
|
if (! append_string(list, ", "))
|
|
goto error;
|
|
|
|
if (!append_integer(list,
|
|
(Py_ssize_t)self->fuzzy_counts[RE_FUZZY_INS]))
|
|
goto error;
|
|
|
|
if (! append_string(list, ", "))
|
|
goto error;
|
|
if (!append_integer(list,
|
|
(Py_ssize_t)self->fuzzy_counts[RE_FUZZY_DEL]))
|
|
goto error;
|
|
|
|
if (! append_string(list, ")"))
|
|
goto error;
|
|
}
|
|
|
|
if (self->partial) {
|
|
if (!append_string(list, ", partial=True"))
|
|
goto error;
|
|
}
|
|
|
|
if (! append_string(list, ">"))
|
|
goto error;
|
|
|
|
separator = Py_BuildValue("s", "");
|
|
if (!separator)
|
|
goto error;
|
|
|
|
result = PyUnicode_Join(separator, list);
|
|
Py_DECREF(separator);
|
|
Py_DECREF(list);
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
/* PatternObject's '__repr__' method. */
|
|
static PyObject* pattern_repr(PyObject* self_) {
|
|
PatternObject* self;
|
|
PyObject* list;
|
|
PyObject* item;
|
|
int status;
|
|
int flag_count;
|
|
unsigned int i;
|
|
Py_ssize_t pos;
|
|
PyObject *key;
|
|
PyObject *value;
|
|
PyObject* separator;
|
|
PyObject* result;
|
|
|
|
self = (PatternObject*)self_;
|
|
|
|
list = PyList_New(0);
|
|
if (!list)
|
|
return NULL;
|
|
|
|
if (!append_string(list, "regex.Regex("))
|
|
goto error;
|
|
|
|
item = PyObject_Repr(self->pattern);
|
|
if (!item)
|
|
goto error;
|
|
|
|
status = PyList_Append(list, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
flag_count = 0;
|
|
for (i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++) {
|
|
if (self->flags & flag_names[i].value) {
|
|
if (flag_count == 0) {
|
|
if (!append_string(list, ", flags="))
|
|
goto error;
|
|
} else {
|
|
if (!append_string(list, " | "))
|
|
goto error;
|
|
}
|
|
|
|
if (!append_string(list, "regex."))
|
|
goto error;
|
|
|
|
if (!append_string(list, flag_names[i].name))
|
|
goto error;
|
|
|
|
++flag_count;
|
|
}
|
|
}
|
|
|
|
pos = 0;
|
|
/* PyDict_Next borrows references. */
|
|
while (PyDict_Next(self->named_lists, &pos, &key, &value)) {
|
|
if (!append_string(list, ", "))
|
|
goto error;
|
|
|
|
status = PyList_Append(list, key);
|
|
if (status < 0)
|
|
goto error;
|
|
|
|
if (!append_string(list, "="))
|
|
goto error;
|
|
|
|
item = PyObject_Repr(value);
|
|
if (!item)
|
|
goto error;
|
|
|
|
status = PyList_Append(list, item);
|
|
Py_DECREF(item);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
|
|
if (!append_string(list, ")"))
|
|
goto error;
|
|
|
|
separator = Py_BuildValue("s", "");
|
|
if (!separator)
|
|
goto error;
|
|
|
|
result = PyUnicode_Join(separator, list);
|
|
Py_DECREF(separator);
|
|
Py_DECREF(list);
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
/* PatternObject's 'groupindex' method. */
|
|
static PyObject* pattern_groupindex(PyObject* self_) {
|
|
PatternObject* self;
|
|
|
|
self = (PatternObject*)self_;
|
|
|
|
return PyDict_Copy(self->groupindex);
|
|
}
|
|
|
|
static PyGetSetDef pattern_getset[] = {
|
|
{"groupindex", (getter)pattern_groupindex, (setter)NULL,
|
|
"A dictionary mapping group names to group numbers."},
|
|
{NULL} /* Sentinel */
|
|
};
|
|
|
|
static PyMemberDef pattern_members[] = {
|
|
{"pattern", T_OBJECT, offsetof(PatternObject, pattern), READONLY,
|
|
"The pattern string from which the regex object was compiled."},
|
|
{"flags", T_PYSSIZET, offsetof(PatternObject, flags), READONLY,
|
|
"The regex matching flags."},
|
|
{"groups", T_PYSSIZET, offsetof(PatternObject, public_group_count),
|
|
READONLY, "The number of capturing groups in the pattern."},
|
|
{"named_lists", T_OBJECT, offsetof(PatternObject, named_lists), READONLY,
|
|
"The named lists used by the regex."},
|
|
{NULL} /* Sentinel */
|
|
};
|
|
|
|
static PyTypeObject Pattern_Type = {
|
|
PyObject_HEAD_INIT(NULL)
|
|
0,
|
|
"_" RE_MODULE "." "Pattern",
|
|
sizeof(PatternObject)
|
|
};
|
|
|
|
/* Building the nodes is made simpler by allowing branches to have a single
|
|
* exit. These need to be removed.
|
|
*/
|
|
Py_LOCAL_INLINE(void) skip_one_way_branches(PatternObject* pattern) {
|
|
BOOL modified;
|
|
|
|
/* If a node refers to a 1-way branch then make the former refer to the
|
|
* latter's destination. Repeat until they're all done.
|
|
*/
|
|
do {
|
|
size_t i;
|
|
|
|
modified = FALSE;
|
|
|
|
for (i = 0; i < pattern->node_count; i++) {
|
|
RE_Node* node;
|
|
RE_Node* next;
|
|
|
|
node = pattern->node_list[i];
|
|
|
|
/* Check the first destination. */
|
|
next = node->next_1.node;
|
|
if (next && next->op == RE_OP_BRANCH &&
|
|
!next->nonstring.next_2.node) {
|
|
node->next_1.node = next->next_1.node;
|
|
modified = TRUE;
|
|
}
|
|
|
|
/* Check the second destination. */
|
|
next = node->nonstring.next_2.node;
|
|
if (next && next->op == RE_OP_BRANCH &&
|
|
!next->nonstring.next_2.node) {
|
|
node->nonstring.next_2.node = next->next_1.node;
|
|
modified = TRUE;
|
|
}
|
|
}
|
|
} while (modified);
|
|
|
|
/* The start node might be a 1-way branch. Skip over it because it'll be
|
|
* removed. It might even be the first in a chain.
|
|
*/
|
|
while (pattern->start_node->op == RE_OP_BRANCH &&
|
|
!pattern->start_node->nonstring.next_2.node)
|
|
pattern->start_node = pattern->start_node->next_1.node;
|
|
}
|
|
|
|
/* Adds guards to repeats which are followed by a reference to a group.
|
|
*
|
|
* Returns whether a guard was added for a node at or after the given node.
|
|
*/
|
|
Py_LOCAL_INLINE(RE_STATUS_T) add_repeat_guards(PatternObject* pattern, RE_Node*
|
|
node) {
|
|
RE_STATUS_T result;
|
|
|
|
result = RE_STATUS_NEITHER;
|
|
|
|
for (;;) {
|
|
if (node->status & RE_STATUS_VISITED_AG)
|
|
return node->status & (RE_STATUS_REPEAT | RE_STATUS_REF);
|
|
|
|
switch (node->op) {
|
|
case RE_OP_ATOMIC:
|
|
case RE_OP_LOOKAROUND:
|
|
{
|
|
RE_STATUS_T body_result;
|
|
RE_STATUS_T tail_result;
|
|
RE_STATUS_T status;
|
|
|
|
body_result = add_repeat_guards(pattern,
|
|
node->nonstring.next_2.node);
|
|
tail_result = add_repeat_guards(pattern, node->next_1.node);
|
|
status = max_status_3(result, body_result, tail_result);
|
|
node->status = RE_STATUS_VISITED_AG | status;
|
|
return status;
|
|
}
|
|
case RE_OP_BRANCH:
|
|
{
|
|
RE_STATUS_T branch_1_result;
|
|
RE_STATUS_T branch_2_result;
|
|
RE_STATUS_T status;
|
|
|
|
branch_1_result = add_repeat_guards(pattern, node->next_1.node);
|
|
branch_2_result = add_repeat_guards(pattern,
|
|
node->nonstring.next_2.node);
|
|
status = max_status_3(result, branch_1_result, branch_2_result);
|
|
node->status = RE_STATUS_VISITED_AG | status;
|
|
return status;
|
|
}
|
|
case RE_OP_END_GREEDY_REPEAT:
|
|
case RE_OP_END_LAZY_REPEAT:
|
|
node->status |= RE_STATUS_VISITED_AG;
|
|
return result;
|
|
case RE_OP_GREEDY_REPEAT:
|
|
case RE_OP_LAZY_REPEAT:
|
|
{
|
|
BOOL limited;
|
|
RE_STATUS_T body_result;
|
|
RE_STATUS_T tail_result;
|
|
RE_RepeatInfo* repeat_info;
|
|
RE_STATUS_T status;
|
|
|
|
limited = ~node->values[2] != 0;
|
|
if (limited)
|
|
body_result = RE_STATUS_LIMITED;
|
|
else
|
|
body_result = add_repeat_guards(pattern, node->next_1.node);
|
|
tail_result = add_repeat_guards(pattern,
|
|
node->nonstring.next_2.node);
|
|
|
|
repeat_info = &pattern->repeat_info[node->values[0]];
|
|
if (body_result != RE_STATUS_REF)
|
|
repeat_info->status |= RE_STATUS_BODY;
|
|
if (tail_result != RE_STATUS_REF)
|
|
repeat_info->status |= RE_STATUS_TAIL;
|
|
if (limited)
|
|
result = max_status_2(result, RE_STATUS_LIMITED);
|
|
else
|
|
result = max_status_2(result, RE_STATUS_REPEAT);
|
|
status = max_status_3(result, body_result, tail_result);
|
|
node->status |= RE_STATUS_VISITED_AG | status;
|
|
return status;
|
|
}
|
|
case RE_OP_GREEDY_REPEAT_ONE:
|
|
case RE_OP_LAZY_REPEAT_ONE:
|
|
{
|
|
BOOL limited;
|
|
RE_STATUS_T tail_result;
|
|
RE_RepeatInfo* repeat_info;
|
|
RE_STATUS_T status;
|
|
|
|
limited = ~node->values[2] != 0;
|
|
tail_result = add_repeat_guards(pattern, node->next_1.node);
|
|
|
|
repeat_info = &pattern->repeat_info[node->values[0]];
|
|
repeat_info->status |= RE_STATUS_BODY;
|
|
if (tail_result != RE_STATUS_REF)
|
|
repeat_info->status |= RE_STATUS_TAIL;
|
|
if (limited)
|
|
result = max_status_2(result, RE_STATUS_LIMITED);
|
|
else
|
|
result = max_status_2(result, RE_STATUS_REPEAT);
|
|
status = max_status_3(result, RE_STATUS_REPEAT, tail_result);
|
|
node->status = RE_STATUS_VISITED_AG | status;
|
|
return status;
|
|
}
|
|
case RE_OP_GROUP_EXISTS:
|
|
{
|
|
RE_STATUS_T branch_1_result;
|
|
RE_STATUS_T branch_2_result;
|
|
RE_STATUS_T status;
|
|
|
|
branch_1_result = add_repeat_guards(pattern, node->next_1.node);
|
|
branch_2_result = add_repeat_guards(pattern,
|
|
node->nonstring.next_2.node);
|
|
status = max_status_4(result, branch_1_result, branch_2_result,
|
|
RE_STATUS_REF);
|
|
node->status = RE_STATUS_VISITED_AG | status;
|
|
return status;
|
|
}
|
|
case RE_OP_GROUP_CALL:
|
|
case RE_OP_REF_GROUP:
|
|
case RE_OP_REF_GROUP_FLD:
|
|
case RE_OP_REF_GROUP_FLD_REV:
|
|
case RE_OP_REF_GROUP_IGN:
|
|
case RE_OP_REF_GROUP_IGN_REV:
|
|
case RE_OP_REF_GROUP_REV:
|
|
result = RE_STATUS_REF;
|
|
node = node->next_1.node;
|
|
break;
|
|
case RE_OP_SUCCESS:
|
|
node->status = RE_STATUS_VISITED_AG | result;
|
|
return result;
|
|
default:
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Adds an index to a node's values unless it's already present.
|
|
*
|
|
* 'offset' is the offset of the index count within the values.
|
|
*/
|
|
Py_LOCAL_INLINE(BOOL) add_index(RE_Node* node, size_t offset, size_t index) {
|
|
size_t index_count;
|
|
size_t first_index;
|
|
size_t i;
|
|
RE_CODE* new_values;
|
|
|
|
if (!node)
|
|
return TRUE;
|
|
|
|
index_count = node->values[offset];
|
|
first_index = offset + 1;
|
|
|
|
/* Is the index already present? */
|
|
for (i = 0; i < index_count; i++) {
|
|
if (node->values[first_index + i] == index)
|
|
return TRUE;
|
|
}
|
|
|
|
/* Allocate more space for the new index. */
|
|
new_values = re_realloc(node->values, (node->value_count + 1) *
|
|
sizeof(RE_CODE));
|
|
if (!new_values)
|
|
return FALSE;
|
|
|
|
++node->value_count;
|
|
node->values = new_values;
|
|
|
|
node->values[first_index + node->values[offset]++] = (RE_CODE)index;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Records the index of every repeat and fuzzy section within atomic
|
|
* subpatterns and lookarounds.
|
|
*/
|
|
Py_LOCAL_INLINE(BOOL) record_subpattern_repeats_and_fuzzy_sections(RE_Node*
|
|
parent_node, size_t offset, size_t repeat_count, RE_Node* node) {
|
|
while (node) {
|
|
if (node->status & RE_STATUS_VISITED_REP)
|
|
return TRUE;
|
|
|
|
node->status |= RE_STATUS_VISITED_REP;
|
|
|
|
switch (node->op) {
|
|
case RE_OP_ATOMIC:
|
|
if (!record_subpattern_repeats_and_fuzzy_sections(node, 0,
|
|
repeat_count, node->nonstring.next_2.node))
|
|
return FALSE;
|
|
node = node->next_1.node;
|
|
break;
|
|
case RE_OP_BRANCH:
|
|
if (!record_subpattern_repeats_and_fuzzy_sections(parent_node,
|
|
offset, repeat_count, node->next_1.node))
|
|
return FALSE;
|
|
node = node->nonstring.next_2.node;
|
|
break;
|
|
case RE_OP_END_FUZZY:
|
|
node = node->next_1.node;
|
|
break;
|
|
case RE_OP_END_GREEDY_REPEAT:
|
|
case RE_OP_END_LAZY_REPEAT:
|
|
return TRUE;
|
|
case RE_OP_FUZZY:
|
|
/* Record the fuzzy index. */
|
|
if (!add_index(parent_node, offset, repeat_count +
|
|
node->values[0]))
|
|
return FALSE;
|
|
node = node->next_1.node;
|
|
break;
|
|
case RE_OP_GREEDY_REPEAT:
|
|
case RE_OP_LAZY_REPEAT:
|
|
/* Record the repeat index. */
|
|
if (!add_index(parent_node, offset, node->values[0]))
|
|
return FALSE;
|
|
if (!record_subpattern_repeats_and_fuzzy_sections(parent_node,
|
|
offset, repeat_count, node->next_1.node))
|
|
return FALSE;
|
|
node = node->nonstring.next_2.node;
|
|
break;
|
|
case RE_OP_GREEDY_REPEAT_ONE:
|
|
case RE_OP_LAZY_REPEAT_ONE:
|
|
/* Record the repeat index. */
|
|
if (!add_index(parent_node, offset, node->values[0]))
|
|
return FALSE;
|
|
node = node->next_1.node;
|
|
break;
|
|
case RE_OP_GROUP_EXISTS:
|
|
if (!record_subpattern_repeats_and_fuzzy_sections(parent_node,
|
|
offset, repeat_count, node->next_1.node))
|
|
return FALSE;
|
|
node = node->nonstring.next_2.node;
|
|
break;
|
|
case RE_OP_LOOKAROUND:
|
|
if (!record_subpattern_repeats_and_fuzzy_sections(node, 1,
|
|
repeat_count, node->nonstring.next_2.node))
|
|
return FALSE;
|
|
node = node->next_1.node;
|
|
break;
|
|
default:
|
|
node = node->next_1.node;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Marks nodes which are being used as used. */
|
|
Py_LOCAL_INLINE(void) use_nodes(RE_Node* node) {
|
|
while (node && !(node->status & RE_STATUS_USED)) {
|
|
node->status |= RE_STATUS_USED;
|
|
if (!(node->status & RE_STATUS_STRING)) {
|
|
if (node->nonstring.next_2.node)
|
|
use_nodes(node->nonstring.next_2.node);
|
|
}
|
|
node = node->next_1.node;
|
|
}
|
|
}
|
|
|
|
/* Discards any unused nodes.
|
|
*
|
|
* Optimising the nodes might result in some nodes no longer being used.
|
|
*/
|
|
Py_LOCAL_INLINE(void) discard_unused_nodes(PatternObject* pattern) {
|
|
size_t new_count;
|
|
size_t i;
|
|
|
|
/* Mark the nodes which are being used. */
|
|
use_nodes(pattern->start_node);
|
|
|
|
for (i = 0; i < pattern->call_ref_info_capacity; i++)
|
|
use_nodes(pattern->call_ref_info[i].node);
|
|
|
|
new_count = 0;
|
|
for (i = 0; i < pattern->node_count; i++) {
|
|
RE_Node* node;
|
|
|
|
node = pattern->node_list[i];
|
|
if (node->status & RE_STATUS_USED)
|
|
pattern->node_list[new_count++] = node;
|
|
else {
|
|
re_dealloc(node->values);
|
|
if (node->status & RE_STATUS_STRING) {
|
|
re_dealloc(node->string.bad_character_offset);
|
|
re_dealloc(node->string.good_suffix_offset);
|
|
}
|
|
re_dealloc(node);
|
|
}
|
|
}
|
|
|
|
pattern->node_count = new_count;
|
|
}
|
|
|
|
/* Marks all the group which are named. Returns FALSE if there's an error. */
|
|
Py_LOCAL_INLINE(BOOL) mark_named_groups(PatternObject* pattern) {
|
|
size_t i;
|
|
|
|
for (i = 0; i < pattern->public_group_count; i++) {
|
|
RE_GroupInfo* group_info;
|
|
PyObject* index;
|
|
int status;
|
|
|
|
group_info = &pattern->group_info[i];
|
|
index = Py_BuildValue("n", i + 1);
|
|
if (!index)
|
|
return FALSE;
|
|
|
|
status = PyDict_Contains(pattern->indexgroup, index);
|
|
Py_DECREF(index);
|
|
if (status < 0)
|
|
return FALSE;
|
|
|
|
group_info->has_name = status == 1;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Gets the test node.
|
|
*
|
|
* The test node lets the matcher look ahead in the pattern, allowing it to
|
|
* avoid the cost of housekeeping, only to find that what follows doesn't match
|
|
* anyway.
|
|
*/
|
|
Py_LOCAL_INLINE(void) set_test_node(RE_NextNode* next) {
|
|
RE_Node* node = next->node;
|
|
RE_Node* test;
|
|
|
|
next->test = node;
|
|
next->match_next = node;
|
|
next->match_step = 0;
|
|
|
|
if (!node)
|
|
return;
|
|
|
|
test = node;
|
|
while (test->op == RE_OP_END_GROUP || test->op == RE_OP_START_GROUP)
|
|
test = test->next_1.node;
|
|
|
|
next->test = test;
|
|
|
|
if (test != node)
|
|
return;
|
|
|
|
switch (test->op) {
|
|
case RE_OP_ANY:
|
|
case RE_OP_ANY_ALL:
|
|
case RE_OP_ANY_ALL_REV:
|
|
case RE_OP_ANY_REV:
|
|
case RE_OP_ANY_U:
|
|
case RE_OP_ANY_U_REV:
|
|
case RE_OP_BOUNDARY:
|
|
case RE_OP_CHARACTER:
|
|
case RE_OP_CHARACTER_IGN:
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
case RE_OP_CHARACTER_REV:
|
|
case RE_OP_DEFAULT_BOUNDARY:
|
|
case RE_OP_DEFAULT_END_OF_WORD:
|
|
case RE_OP_DEFAULT_START_OF_WORD:
|
|
case RE_OP_END_OF_LINE:
|
|
case RE_OP_END_OF_LINE_U:
|
|
case RE_OP_END_OF_STRING:
|
|
case RE_OP_END_OF_STRING_LINE:
|
|
case RE_OP_END_OF_STRING_LINE_U:
|
|
case RE_OP_END_OF_WORD:
|
|
case RE_OP_GRAPHEME_BOUNDARY:
|
|
case RE_OP_PROPERTY:
|
|
case RE_OP_PROPERTY_IGN:
|
|
case RE_OP_PROPERTY_IGN_REV:
|
|
case RE_OP_PROPERTY_REV:
|
|
case RE_OP_RANGE:
|
|
case RE_OP_RANGE_IGN:
|
|
case RE_OP_RANGE_IGN_REV:
|
|
case RE_OP_RANGE_REV:
|
|
case RE_OP_SEARCH_ANCHOR:
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_DIFF_IGN:
|
|
case RE_OP_SET_DIFF_IGN_REV:
|
|
case RE_OP_SET_DIFF_REV:
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
case RE_OP_SET_INTER_REV:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
case RE_OP_SET_UNION:
|
|
case RE_OP_SET_UNION_IGN:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
case RE_OP_SET_UNION_REV:
|
|
case RE_OP_START_OF_LINE:
|
|
case RE_OP_START_OF_LINE_U:
|
|
case RE_OP_START_OF_STRING:
|
|
case RE_OP_START_OF_WORD:
|
|
case RE_OP_STRING:
|
|
case RE_OP_STRING_FLD:
|
|
case RE_OP_STRING_FLD_REV:
|
|
case RE_OP_STRING_IGN:
|
|
case RE_OP_STRING_IGN_REV:
|
|
case RE_OP_STRING_REV:
|
|
next->match_next = test->next_1.node;
|
|
next->match_step = test->step;
|
|
break;
|
|
case RE_OP_GREEDY_REPEAT_ONE:
|
|
case RE_OP_LAZY_REPEAT_ONE:
|
|
if (test->values[1] > 0)
|
|
next->test = test;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* Sets the test nodes. */
|
|
Py_LOCAL_INLINE(void) set_test_nodes(PatternObject* pattern) {
|
|
RE_Node** node_list;
|
|
size_t i;
|
|
|
|
node_list = pattern->node_list;
|
|
for (i = 0; i < pattern->node_count; i++) {
|
|
RE_Node* node;
|
|
|
|
node = node_list[i];
|
|
set_test_node(&node->next_1);
|
|
if (!(node->status & RE_STATUS_STRING))
|
|
set_test_node(&node->nonstring.next_2);
|
|
}
|
|
}
|
|
|
|
/* Optimises the pattern. */
|
|
Py_LOCAL_INLINE(BOOL) optimise_pattern(PatternObject* pattern) {
|
|
size_t i;
|
|
|
|
/* Building the nodes is made simpler by allowing branches to have a single
|
|
* exit. These need to be removed.
|
|
*/
|
|
skip_one_way_branches(pattern);
|
|
|
|
/* Add position guards for repeat bodies containing a reference to a group
|
|
* or repeat tails followed at some point by a reference to a group.
|
|
*/
|
|
add_repeat_guards(pattern, pattern->start_node);
|
|
|
|
/* Record the index of repeats and fuzzy sections within the body of atomic
|
|
* and lookaround nodes.
|
|
*/
|
|
if (!record_subpattern_repeats_and_fuzzy_sections(NULL, 0,
|
|
pattern->repeat_count, pattern->start_node))
|
|
return FALSE;
|
|
|
|
for (i = 0; i < pattern->call_ref_info_count; i++) {
|
|
RE_Node* node;
|
|
|
|
node = pattern->call_ref_info[i].node;
|
|
if (!record_subpattern_repeats_and_fuzzy_sections(NULL, 0,
|
|
pattern->repeat_count, node))
|
|
return FALSE;
|
|
}
|
|
|
|
/* Discard any unused nodes. */
|
|
discard_unused_nodes(pattern);
|
|
|
|
/* Set the test nodes. */
|
|
set_test_nodes(pattern);
|
|
|
|
/* Mark all the group that are named. */
|
|
if (!mark_named_groups(pattern))
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Creates a new pattern node. */
|
|
Py_LOCAL_INLINE(RE_Node*) create_node(PatternObject* pattern, RE_UINT8 op,
|
|
RE_CODE flags, Py_ssize_t step, size_t value_count) {
|
|
RE_Node* node;
|
|
|
|
node = (RE_Node*)re_alloc(sizeof(*node));
|
|
if (!node)
|
|
return NULL;
|
|
memset(node, 0, sizeof(RE_Node));
|
|
|
|
node->value_count = value_count;
|
|
if (node->value_count > 0) {
|
|
node->values = (RE_CODE*)re_alloc(node->value_count * sizeof(RE_CODE));
|
|
if (!node->values)
|
|
goto error;
|
|
} else
|
|
node->values = NULL;
|
|
|
|
node->op = op;
|
|
node->match = (flags & RE_POSITIVE_OP) != 0;
|
|
node->status = (RE_STATUS_T)(flags << RE_STATUS_SHIFT);
|
|
node->step = step;
|
|
|
|
/* Ensure that there's enough storage to record the new node. */
|
|
if (pattern->node_count >= pattern->node_capacity) {
|
|
RE_Node** new_node_list;
|
|
|
|
pattern->node_capacity *= 2;
|
|
if (pattern->node_capacity == 0)
|
|
pattern->node_capacity = RE_INIT_NODE_LIST_SIZE;
|
|
new_node_list = (RE_Node**)re_realloc(pattern->node_list,
|
|
pattern->node_capacity * sizeof(RE_Node*));
|
|
if (!new_node_list)
|
|
goto error;
|
|
pattern->node_list = new_node_list;
|
|
}
|
|
|
|
/* Record the new node. */
|
|
pattern->node_list[pattern->node_count++] = node;
|
|
|
|
return node;
|
|
|
|
error:
|
|
re_dealloc(node->values);
|
|
re_dealloc(node);
|
|
return NULL;
|
|
}
|
|
|
|
/* Adds a node as a next node for another node. */
|
|
Py_LOCAL_INLINE(void) add_node(RE_Node* node_1, RE_Node* node_2) {
|
|
if (!node_1->next_1.node)
|
|
node_1->next_1.node = node_2;
|
|
else
|
|
node_1->nonstring.next_2.node = node_2;
|
|
}
|
|
|
|
/* Ensures that the entry for a group's details actually exists. */
|
|
Py_LOCAL_INLINE(BOOL) ensure_group(PatternObject* pattern, size_t group) {
|
|
size_t old_capacity;
|
|
size_t new_capacity;
|
|
RE_GroupInfo* new_group_info;
|
|
|
|
if (group <= pattern->true_group_count)
|
|
/* We already have an entry for the group. */
|
|
return TRUE;
|
|
|
|
/* Increase the storage capacity to include the new entry if it's
|
|
* insufficient.
|
|
*/
|
|
old_capacity = pattern->group_info_capacity;
|
|
new_capacity = pattern->group_info_capacity;
|
|
while (group > new_capacity)
|
|
new_capacity += RE_LIST_SIZE_INC;
|
|
|
|
if (new_capacity > old_capacity) {
|
|
new_group_info = (RE_GroupInfo*)re_realloc(pattern->group_info,
|
|
new_capacity * sizeof(RE_GroupInfo));
|
|
if (!new_group_info)
|
|
return FALSE;
|
|
memset(new_group_info + old_capacity, 0, (new_capacity - old_capacity)
|
|
* sizeof(RE_GroupInfo));
|
|
|
|
pattern->group_info = new_group_info;
|
|
pattern->group_info_capacity = new_capacity;
|
|
}
|
|
|
|
pattern->true_group_count = group;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Records that there's a reference to a group. */
|
|
Py_LOCAL_INLINE(BOOL) record_ref_group(PatternObject* pattern, size_t group) {
|
|
if (!ensure_group(pattern, group))
|
|
return FALSE;
|
|
|
|
pattern->group_info[group - 1].referenced = TRUE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Records that there's a new group. */
|
|
Py_LOCAL_INLINE(BOOL) record_group(PatternObject* pattern, size_t group,
|
|
RE_Node* node) {
|
|
if (!ensure_group(pattern, group))
|
|
return FALSE;
|
|
|
|
if (group >= 1) {
|
|
RE_GroupInfo* info;
|
|
|
|
info = &pattern->group_info[group - 1];
|
|
info->end_index = (Py_ssize_t)pattern->true_group_count;
|
|
info->node = node;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Records that a group has closed. */
|
|
Py_LOCAL_INLINE(void) record_group_end(PatternObject* pattern, size_t group) {
|
|
if (group >= 1)
|
|
pattern->group_info[group - 1].end_index = ++pattern->group_end_index;
|
|
}
|
|
|
|
/* Ensures that the entry for a call_ref's details actually exists. */
|
|
Py_LOCAL_INLINE(BOOL) ensure_call_ref(PatternObject* pattern, size_t call_ref)
|
|
{
|
|
size_t old_capacity;
|
|
size_t new_capacity;
|
|
RE_CallRefInfo* new_call_ref_info;
|
|
|
|
if (call_ref < pattern->call_ref_info_count)
|
|
/* We already have an entry for the call_ref. */
|
|
return TRUE;
|
|
|
|
/* Increase the storage capacity to include the new entry if it's
|
|
* insufficient.
|
|
*/
|
|
old_capacity = pattern->call_ref_info_capacity;
|
|
new_capacity = pattern->call_ref_info_capacity;
|
|
while (call_ref >= new_capacity)
|
|
new_capacity += RE_LIST_SIZE_INC;
|
|
|
|
if (new_capacity > old_capacity) {
|
|
new_call_ref_info = (RE_CallRefInfo*)re_realloc(pattern->call_ref_info,
|
|
new_capacity * sizeof(RE_CallRefInfo));
|
|
if (!new_call_ref_info)
|
|
return FALSE;
|
|
memset(new_call_ref_info + old_capacity, 0, (new_capacity -
|
|
old_capacity) * sizeof(RE_CallRefInfo));
|
|
|
|
pattern->call_ref_info = new_call_ref_info;
|
|
pattern->call_ref_info_capacity = new_capacity;
|
|
}
|
|
|
|
pattern->call_ref_info_count = 1 + call_ref;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Records that a call_ref is defined. */
|
|
Py_LOCAL_INLINE(BOOL) record_call_ref_defined(PatternObject* pattern, size_t
|
|
call_ref, RE_Node* node) {
|
|
if (!ensure_call_ref(pattern, call_ref))
|
|
return FALSE;
|
|
|
|
pattern->call_ref_info[call_ref].defined = TRUE;
|
|
pattern->call_ref_info[call_ref].node = node;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Records that a call_ref is used. */
|
|
Py_LOCAL_INLINE(BOOL) record_call_ref_used(PatternObject* pattern, size_t
|
|
call_ref) {
|
|
if (!ensure_call_ref(pattern, call_ref))
|
|
return FALSE;
|
|
|
|
pattern->call_ref_info[call_ref].used = TRUE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Checks whether a node matches one and only one character. */
|
|
Py_LOCAL_INLINE(BOOL) sequence_matches_one(RE_Node* node) {
|
|
while (node->op == RE_OP_BRANCH && !node->nonstring.next_2.node)
|
|
node = node->next_1.node;
|
|
|
|
if (node->next_1.node || (node->status & RE_STATUS_FUZZY))
|
|
return FALSE;
|
|
|
|
return node_matches_one_character(node);
|
|
}
|
|
|
|
/* Records a repeat. */
|
|
Py_LOCAL_INLINE(BOOL) record_repeat(PatternObject* pattern, size_t index,
|
|
size_t repeat_depth) {
|
|
size_t old_capacity;
|
|
size_t new_capacity;
|
|
|
|
/* Increase the storage capacity to include the new entry if it's
|
|
* insufficient.
|
|
*/
|
|
old_capacity = pattern->repeat_info_capacity;
|
|
new_capacity = pattern->repeat_info_capacity;
|
|
while (index >= new_capacity)
|
|
new_capacity += RE_LIST_SIZE_INC;
|
|
|
|
if (new_capacity > old_capacity) {
|
|
RE_RepeatInfo* new_repeat_info;
|
|
|
|
new_repeat_info = (RE_RepeatInfo*)re_realloc(pattern->repeat_info,
|
|
new_capacity * sizeof(RE_RepeatInfo));
|
|
if (!new_repeat_info)
|
|
return FALSE;
|
|
memset(new_repeat_info + old_capacity, 0, (new_capacity - old_capacity)
|
|
* sizeof(RE_RepeatInfo));
|
|
|
|
pattern->repeat_info = new_repeat_info;
|
|
pattern->repeat_info_capacity = new_capacity;
|
|
}
|
|
|
|
if (index >= pattern->repeat_count)
|
|
pattern->repeat_count = index + 1;
|
|
|
|
if (repeat_depth > 0)
|
|
pattern->repeat_info[index].status |= RE_STATUS_INNER;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Gets the step for an opcode: 1 for the forward ops listed, -1 for their
 * reverse ('_REV') forms and 0 for every other opcode.
 */
Py_LOCAL_INLINE(Py_ssize_t) get_step(RE_CODE op) {
    Py_ssize_t step = 0;

    switch (op) {
    /* Forward ops. */
    case RE_OP_ANY: case RE_OP_ANY_ALL: case RE_OP_ANY_U:
    case RE_OP_CHARACTER: case RE_OP_CHARACTER_IGN:
    case RE_OP_PROPERTY: case RE_OP_PROPERTY_IGN:
    case RE_OP_RANGE: case RE_OP_RANGE_IGN:
    case RE_OP_SET_DIFF: case RE_OP_SET_DIFF_IGN:
    case RE_OP_SET_INTER: case RE_OP_SET_INTER_IGN:
    case RE_OP_SET_SYM_DIFF: case RE_OP_SET_SYM_DIFF_IGN:
    case RE_OP_SET_UNION: case RE_OP_SET_UNION_IGN:
    case RE_OP_STRING: case RE_OP_STRING_FLD: case RE_OP_STRING_IGN:
        step = 1;
        break;
    /* Reverse ops. */
    case RE_OP_ANY_ALL_REV: case RE_OP_ANY_REV: case RE_OP_ANY_U_REV:
    case RE_OP_CHARACTER_IGN_REV: case RE_OP_CHARACTER_REV:
    case RE_OP_PROPERTY_IGN_REV: case RE_OP_PROPERTY_REV:
    case RE_OP_RANGE_IGN_REV: case RE_OP_RANGE_REV:
    case RE_OP_SET_DIFF_IGN_REV: case RE_OP_SET_DIFF_REV:
    case RE_OP_SET_INTER_IGN_REV: case RE_OP_SET_INTER_REV:
    case RE_OP_SET_SYM_DIFF_IGN_REV: case RE_OP_SET_SYM_DIFF_REV:
    case RE_OP_SET_UNION_IGN_REV: case RE_OP_SET_UNION_REV:
    case RE_OP_STRING_FLD_REV: case RE_OP_STRING_IGN_REV:
    case RE_OP_STRING_REV:
        step = -1;
        break;
    default:
        break;
    }

    return step;
}
|
|
|
|
Py_LOCAL_INLINE(int) build_sequence(RE_CompileArgs* args);
|
|
|
|
/* Builds an ANY node. */
|
|
Py_LOCAL_INLINE(int) build_ANY(RE_CompileArgs* args) {
|
|
RE_UINT8 op;
|
|
RE_CODE flags;
|
|
Py_ssize_t step;
|
|
RE_Node* node;
|
|
|
|
/* codes: opcode, flags. */
|
|
if (args->code + 1 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
op = (RE_UINT8)args->code[0];
|
|
flags = args->code[1];
|
|
|
|
step = get_step(op);
|
|
|
|
/* Create the node. */
|
|
node = create_node(args->pattern, op, flags, step, 0);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
args->code += 2;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
++args->min_width;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a FUZZY node. */
|
|
Py_LOCAL_INLINE(int) build_FUZZY(RE_CompileArgs* args) {
|
|
RE_CODE flags;
|
|
RE_Node* start_node;
|
|
RE_Node* end_node;
|
|
RE_CODE index;
|
|
RE_CompileArgs subargs;
|
|
int status;
|
|
|
|
/* codes: opcode, flags, constraints, sequence, end. */
|
|
if (args->code + 13 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
flags = args->code[1];
|
|
|
|
/* Create nodes for the start and end of the fuzzy sequence. */
|
|
start_node = create_node(args->pattern, RE_OP_FUZZY, flags, 0, 9);
|
|
end_node = create_node(args->pattern, RE_OP_END_FUZZY, flags, 0, 5);
|
|
if (!start_node || !end_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
index = (RE_CODE)args->pattern->fuzzy_count++;
|
|
start_node->values[0] = index;
|
|
end_node->values[0] = index;
|
|
|
|
/* The constraints consist of 4 pairs of limits and the cost equation. */
|
|
end_node->values[RE_FUZZY_VAL_MIN_DEL] = args->code[2]; /* Deletion minimum. */
|
|
end_node->values[RE_FUZZY_VAL_MIN_INS] = args->code[4]; /* Insertion minimum. */
|
|
end_node->values[RE_FUZZY_VAL_MIN_SUB] = args->code[6]; /* Substitution minimum. */
|
|
end_node->values[RE_FUZZY_VAL_MIN_ERR] = args->code[8]; /* Error minimum. */
|
|
|
|
start_node->values[RE_FUZZY_VAL_MAX_DEL] = args->code[3]; /* Deletion maximum. */
|
|
start_node->values[RE_FUZZY_VAL_MAX_INS] = args->code[5]; /* Insertion maximum. */
|
|
start_node->values[RE_FUZZY_VAL_MAX_SUB] = args->code[7]; /* Substitution maximum. */
|
|
start_node->values[RE_FUZZY_VAL_MAX_ERR] = args->code[9]; /* Error maximum. */
|
|
|
|
start_node->values[RE_FUZZY_VAL_DEL_COST] = args->code[10]; /* Deletion cost. */
|
|
start_node->values[RE_FUZZY_VAL_INS_COST] = args->code[11]; /* Insertion cost. */
|
|
start_node->values[RE_FUZZY_VAL_SUB_COST] = args->code[12]; /* Substitution cost. */
|
|
start_node->values[RE_FUZZY_VAL_MAX_COST] = args->code[13]; /* Total cost. */
|
|
|
|
args->code += 14;
|
|
|
|
subargs = *args;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = TRUE;
|
|
subargs.within_fuzzy = TRUE;
|
|
|
|
/* Compile the sequence and check that we've reached the end of the
|
|
* subpattern.
|
|
*/
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code = subargs.code;
|
|
args->min_width = subargs.min_width;
|
|
args->has_captures |= subargs.has_captures;
|
|
|
|
++args->code;
|
|
|
|
/* Append the fuzzy sequence. */
|
|
add_node(args->end, start_node);
|
|
add_node(start_node, subargs.start);
|
|
add_node(subargs.end, end_node);
|
|
args->end = end_node;
|
|
|
|
args->is_fuzzy = TRUE;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds an ATOMIC node. */
|
|
Py_LOCAL_INLINE(int) build_ATOMIC(RE_CompileArgs* args) {
|
|
RE_Node* atomic_node;
|
|
RE_CompileArgs subargs;
|
|
RE_Node* success_node;
|
|
int status;
|
|
|
|
/* codes: opcode, sequence, end. */
|
|
if (args->code + 1 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
atomic_node = create_node(args->pattern, RE_OP_ATOMIC, 0, 0, 1);
|
|
if (!atomic_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* The number of repeat indexes. */
|
|
atomic_node->values[0] = 0;
|
|
|
|
++args->code;
|
|
|
|
subargs = *args;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
|
|
/* Compile the sequence and check that we've reached the end of the
|
|
* subpattern.
|
|
*/
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
/* Create the success node to terminate the subpattern. */
|
|
success_node = create_node(subargs.pattern, RE_OP_SUCCESS, 0, 0, 0);
|
|
if (!success_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Append the SUCCESS node. */
|
|
add_node(subargs.end, success_node);
|
|
|
|
/* Insert the subpattern. */
|
|
atomic_node->nonstring.next_2.node = subargs.start;
|
|
|
|
args->code = subargs.code;
|
|
args->min_width = subargs.min_width;
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
++args->code;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, atomic_node);
|
|
args->end = atomic_node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a BOUNDARY node. */
|
|
Py_LOCAL_INLINE(int) build_BOUNDARY(RE_CompileArgs* args) {
|
|
RE_UINT8 op;
|
|
RE_CODE flags;
|
|
RE_Node* node;
|
|
|
|
/* codes: opcode, flags. */
|
|
if (args->code + 1 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
op = (RE_UINT8)args->code[0];
|
|
flags = args->code[1];
|
|
|
|
args->code += 2;
|
|
|
|
/* Create the node. */
|
|
node = create_node(args->pattern, op, flags, 0, 0);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a BRANCH node. */
|
|
Py_LOCAL_INLINE(int) build_BRANCH(RE_CompileArgs* args) {
|
|
RE_Node* branch_node;
|
|
RE_Node* join_node;
|
|
Py_ssize_t smallest_min_width;
|
|
RE_CompileArgs subargs;
|
|
int status;
|
|
|
|
/* codes: opcode, branch, next, branch, end. */
|
|
if (args->code + 2 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
/* Create nodes for the start and end of the branch sequence. */
|
|
branch_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0);
|
|
join_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0);
|
|
if (!branch_node || !join_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, branch_node);
|
|
args->end = join_node;
|
|
|
|
smallest_min_width = PY_SSIZE_T_MAX;
|
|
|
|
subargs = *args;
|
|
|
|
/* A branch in the regular expression is compiled into a series of 2-way
|
|
* branches.
|
|
*/
|
|
do {
|
|
RE_Node* next_branch_node;
|
|
|
|
/* Skip over the 'BRANCH' or 'NEXT' opcode. */
|
|
++subargs.code;
|
|
|
|
/* Compile the sequence until the next 'BRANCH' or 'NEXT' opcode. */
|
|
subargs.min_width = 0;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
smallest_min_width = min_ssize_t(smallest_min_width,
|
|
subargs.min_width);
|
|
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
/* Append the sequence. */
|
|
add_node(branch_node, subargs.start);
|
|
add_node(subargs.end, join_node);
|
|
|
|
/* Create a start node for the next sequence and append it. */
|
|
next_branch_node = create_node(subargs.pattern, RE_OP_BRANCH, 0, 0, 0);
|
|
if (!next_branch_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
add_node(branch_node, next_branch_node);
|
|
branch_node = next_branch_node;
|
|
} while (subargs.code < subargs.end_code && subargs.code[0] == RE_OP_NEXT);
|
|
|
|
/* We should have reached the end of the branch. */
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code = subargs.code;
|
|
|
|
++args->code;
|
|
args->min_width += smallest_min_width;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a CALL_REF node. */
|
|
Py_LOCAL_INLINE(int) build_CALL_REF(RE_CompileArgs* args) {
|
|
RE_CODE call_ref;
|
|
RE_Node* start_node;
|
|
RE_Node* end_node;
|
|
RE_CompileArgs subargs;
|
|
int status;
|
|
|
|
/* codes: opcode, call_ref. */
|
|
if (args->code + 1 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
call_ref = args->code[1];
|
|
|
|
args->code += 2;
|
|
|
|
/* Create nodes for the start and end of the subpattern. */
|
|
start_node = create_node(args->pattern, RE_OP_CALL_REF, 0, 0, 1);
|
|
end_node = create_node(args->pattern, RE_OP_GROUP_RETURN, 0, 0, 0);
|
|
if (!start_node || !end_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
start_node->values[0] = call_ref;
|
|
|
|
/* Compile the sequence and check that we've reached the end of the
|
|
* subpattern.
|
|
*/
|
|
subargs = *args;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code = subargs.code;
|
|
args->min_width = subargs.min_width;
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
++args->code;
|
|
|
|
/* Record that we defined a call_ref. */
|
|
if (!record_call_ref_defined(args->pattern, call_ref, start_node))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, start_node);
|
|
add_node(start_node, subargs.start);
|
|
add_node(subargs.end, end_node);
|
|
args->end = end_node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a CHARACTER or PROPERTY node. */
|
|
Py_LOCAL_INLINE(int) build_CHARACTER_or_PROPERTY(RE_CompileArgs* args) {
|
|
RE_UINT8 op;
|
|
RE_CODE flags;
|
|
Py_ssize_t step;
|
|
RE_Node* node;
|
|
|
|
/* codes: opcode, flags, value. */
|
|
if (args->code + 2 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
op = (RE_UINT8)args->code[0];
|
|
flags = args->code[1];
|
|
|
|
step = get_step(op);
|
|
|
|
if (flags & RE_ZEROWIDTH_OP)
|
|
step = 0;
|
|
|
|
/* Create the node. */
|
|
node = create_node(args->pattern, op, flags, step, 1);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
node->values[0] = args->code[2];
|
|
|
|
args->code += 3;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
if (step != 0)
|
|
++args->min_width;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a GROUP node. */
|
|
Py_LOCAL_INLINE(int) build_GROUP(RE_CompileArgs* args) {
|
|
RE_CODE private_group;
|
|
RE_CODE public_group;
|
|
RE_Node* start_node;
|
|
RE_Node* end_node;
|
|
RE_CompileArgs subargs;
|
|
int status;
|
|
|
|
/* codes: opcode, private_group, public_group. */
|
|
if (args->code + 2 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
private_group = args->code[1];
|
|
public_group = args->code[2];
|
|
|
|
args->code += 3;
|
|
|
|
/* Create nodes for the start and end of the capture group. */
|
|
start_node = create_node(args->pattern, args->forward ? RE_OP_START_GROUP :
|
|
RE_OP_END_GROUP, 0, 0, 3);
|
|
end_node = create_node(args->pattern, args->forward ? RE_OP_END_GROUP :
|
|
RE_OP_START_GROUP, 0, 0, 3);
|
|
if (!start_node || !end_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
start_node->values[0] = private_group;
|
|
end_node->values[0] = private_group;
|
|
start_node->values[1] = public_group;
|
|
end_node->values[1] = public_group;
|
|
|
|
/* Signal that the capture should be saved when it's complete. */
|
|
start_node->values[2] = 0;
|
|
end_node->values[2] = 1;
|
|
|
|
/* Record that we have a new capture group. */
|
|
if (!record_group(args->pattern, private_group, start_node))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Compile the sequence and check that we've reached the end of the capture
|
|
* group.
|
|
*/
|
|
subargs = *args;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code = subargs.code;
|
|
args->min_width = subargs.min_width;
|
|
if (subargs.has_captures || subargs.visible_captures)
|
|
args->has_captures = TRUE;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
++args->code;
|
|
|
|
/* Record that the capture group has closed. */
|
|
record_group_end(args->pattern, private_group);
|
|
|
|
/* Append the capture group. */
|
|
add_node(args->end, start_node);
|
|
add_node(start_node, subargs.start);
|
|
add_node(subargs.end, end_node);
|
|
args->end = end_node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a GROUP_CALL node. */
|
|
Py_LOCAL_INLINE(int) build_GROUP_CALL(RE_CompileArgs* args) {
|
|
RE_CODE call_ref;
|
|
RE_Node* node;
|
|
|
|
/* codes: opcode, call_ref. */
|
|
if (args->code + 1 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
call_ref = args->code[1];
|
|
|
|
/* Create the node. */
|
|
node = create_node(args->pattern, RE_OP_GROUP_CALL, 0, 0, 1);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
node->values[0] = call_ref;
|
|
|
|
args->code += 2;
|
|
|
|
/* Record that we used a call_ref. */
|
|
if (!record_call_ref_used(args->pattern, call_ref))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a GROUP_EXISTS node. */
|
|
Py_LOCAL_INLINE(int) build_GROUP_EXISTS(RE_CompileArgs* args) {
|
|
RE_CODE group;
|
|
RE_Node* start_node;
|
|
RE_Node* end_node;
|
|
RE_CompileArgs subargs;
|
|
Py_ssize_t min_width;
|
|
int status;
|
|
|
|
/* codes: opcode, sequence, next, sequence, end. */
|
|
if (args->code + 2 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
group = args->code[1];
|
|
|
|
args->code += 2;
|
|
|
|
/* Create nodes for the start and end of the structure. */
|
|
start_node = create_node(args->pattern, RE_OP_GROUP_EXISTS, 0, 0, 1);
|
|
end_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0);
|
|
if (!start_node || !end_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
start_node->values[0] = group;
|
|
|
|
subargs = *args;
|
|
subargs.min_width = 0;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
args->code = subargs.code;
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
min_width = subargs.min_width;
|
|
|
|
/* Append the start node. */
|
|
add_node(args->end, start_node);
|
|
add_node(start_node, subargs.start);
|
|
add_node(subargs.end, end_node);
|
|
|
|
if (args->code[0] == RE_OP_NEXT) {
|
|
++args->code;
|
|
|
|
subargs.code = args->code;
|
|
subargs.min_width = 0;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
args->code = subargs.code;
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
min_width = min_ssize_t(min_width, subargs.min_width);
|
|
|
|
add_node(start_node, subargs.start);
|
|
add_node(subargs.end, end_node);
|
|
} else {
|
|
add_node(start_node, end_node);
|
|
|
|
min_width = 0;
|
|
}
|
|
|
|
args->min_width += min_width;
|
|
|
|
if (args->code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
++args->code;
|
|
|
|
args->end = end_node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a LOOKAROUND node. */
|
|
Py_LOCAL_INLINE(int) build_LOOKAROUND(RE_CompileArgs* args) {
|
|
RE_CODE flags;
|
|
BOOL forward;
|
|
RE_Node* lookaround_node;
|
|
RE_Node* success_node;
|
|
RE_CompileArgs subargs;
|
|
int status;
|
|
|
|
/* codes: opcode, flags, forward, sequence, end. */
|
|
if (args->code + 3 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
flags = args->code[1];
|
|
forward = (BOOL)args->code[2];
|
|
|
|
/* Create a node for the lookaround. */
|
|
lookaround_node = create_node(args->pattern, RE_OP_LOOKAROUND, flags, 0,
|
|
2);
|
|
if (!lookaround_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* The number of repeat indexes. */
|
|
lookaround_node->values[1] = 0;
|
|
|
|
args->code += 3;
|
|
|
|
/* Compile the sequence and check that we've reached the end of the
|
|
* subpattern.
|
|
*/
|
|
subargs = *args;
|
|
subargs.forward = forward;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
lookaround_node->values[0] = subargs.has_captures;
|
|
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code = subargs.code;
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
++args->code;
|
|
|
|
/* Create the 'SUCCESS' node and append it to the subpattern. */
|
|
success_node = create_node(args->pattern, RE_OP_SUCCESS, 0, 0, 0);
|
|
if (!success_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Append the SUCCESS node. */
|
|
add_node(subargs.end, success_node);
|
|
|
|
/* Insert the subpattern into the node. */
|
|
lookaround_node->nonstring.next_2.node = subargs.start;
|
|
|
|
/* Append the lookaround. */
|
|
add_node(args->end, lookaround_node);
|
|
args->end = lookaround_node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a RANGE node. */
|
|
Py_LOCAL_INLINE(int) build_RANGE(RE_CompileArgs* args) {
|
|
RE_UINT8 op;
|
|
RE_CODE flags;
|
|
Py_ssize_t step;
|
|
RE_Node* node;
|
|
|
|
/* codes: opcode, flags, lower, upper. */
|
|
if (args->code + 3 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
op = (RE_UINT8)args->code[0];
|
|
flags = args->code[1];
|
|
|
|
step = get_step(op);
|
|
|
|
if (flags & RE_ZEROWIDTH_OP)
|
|
step = 0;
|
|
|
|
/* Create the node. */
|
|
node = create_node(args->pattern, op, flags, step, 2);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
node->values[0] = args->code[2];
|
|
node->values[1] = args->code[3];
|
|
|
|
args->code += 4;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
if (step != 0)
|
|
++args->min_width;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a REF_GROUP node. */
|
|
Py_LOCAL_INLINE(int) build_REF_GROUP(RE_CompileArgs* args) {
|
|
RE_CODE flags;
|
|
RE_CODE group;
|
|
RE_Node* node;
|
|
|
|
/* codes: opcode, flags, group. */
|
|
if (args->code + 2 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
flags = args->code[1];
|
|
group = args->code[2];
|
|
node = create_node(args->pattern, (RE_UINT8)args->code[0], flags, 0, 1);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
node->values[0] = group;
|
|
|
|
args->code += 3;
|
|
|
|
/* Record that we have a reference to a group. */
|
|
if (!record_ref_group(args->pattern, group))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Append the reference. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a REPEAT node. */
|
|
Py_LOCAL_INLINE(int) build_REPEAT(RE_CompileArgs* args) {
|
|
BOOL greedy;
|
|
RE_CODE min_count;
|
|
RE_CODE max_count;
|
|
int status;
|
|
|
|
/* codes: opcode, min_count, max_count, sequence, end. */
|
|
if (args->code + 3 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
/* This includes special cases such as optional items, which we'll check
|
|
* for and treat specially. They don't need repeat counts, which helps us
|
|
* avoid unnecessary work when matching.
|
|
*/
|
|
greedy = args->code[0] == RE_OP_GREEDY_REPEAT;
|
|
min_count = args->code[1];
|
|
max_count = args->code[2];
|
|
if (args->code[1] > args->code[2])
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code += 3;
|
|
|
|
if (min_count == 0 && max_count == 1) {
|
|
/* Optional sequence. */
|
|
RE_Node* branch_node;
|
|
RE_Node* join_node;
|
|
RE_CompileArgs subargs;
|
|
|
|
/* Create the start and end nodes. */
|
|
branch_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0);
|
|
join_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0);
|
|
if (!branch_node || !join_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Compile the sequence and check that we've reached the end of it. */
|
|
subargs = *args;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code = subargs.code;
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
++args->code;
|
|
|
|
if (greedy) {
|
|
/* It's a greedy option. */
|
|
add_node(branch_node, subargs.start);
|
|
add_node(branch_node, join_node);
|
|
} else {
|
|
/* It's a lazy option. */
|
|
add_node(branch_node, join_node);
|
|
add_node(branch_node, subargs.start);
|
|
}
|
|
add_node(subargs.end, join_node);
|
|
|
|
/* Append the optional sequence. */
|
|
add_node(args->end, branch_node);
|
|
args->end = join_node;
|
|
} else if (min_count == 1 && max_count == 1) {
|
|
/* Singly-repeated sequence. */
|
|
RE_CompileArgs subargs;
|
|
|
|
subargs = *args;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code = subargs.code;
|
|
args->min_width = subargs.min_width;
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
++args->code;
|
|
|
|
/* Append the sequence. */
|
|
add_node(args->end, subargs.start);
|
|
args->end = subargs.end;
|
|
} else {
|
|
size_t index;
|
|
RE_Node* repeat_node;
|
|
RE_CompileArgs subargs;
|
|
|
|
index = args->pattern->repeat_count;
|
|
|
|
/* Create the nodes for the repeat. */
|
|
repeat_node = create_node(args->pattern, greedy ? RE_OP_GREEDY_REPEAT :
|
|
RE_OP_LAZY_REPEAT, 0, args->forward ? 1 : -1, 4);
|
|
if (!repeat_node || !record_repeat(args->pattern, index,
|
|
args->repeat_depth))
|
|
return RE_ERROR_MEMORY;
|
|
|
|
repeat_node->values[0] = (RE_CODE)index;
|
|
repeat_node->values[1] = min_count;
|
|
repeat_node->values[2] = max_count;
|
|
repeat_node->values[3] = args->forward;
|
|
|
|
if (args->within_fuzzy)
|
|
args->pattern->repeat_info[index].status |= RE_STATUS_BODY;
|
|
|
|
/* Compile the 'body' and check that we've reached the end of it. */
|
|
subargs = *args;
|
|
subargs.min_width = 0;
|
|
subargs.visible_captures = TRUE;
|
|
subargs.has_captures = FALSE;
|
|
subargs.is_fuzzy = FALSE;
|
|
++subargs.repeat_depth;
|
|
status = build_sequence(&subargs);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
|
|
if (subargs.code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
args->code = subargs.code;
|
|
args->min_width += (Py_ssize_t)min_count * subargs.min_width;
|
|
args->has_captures |= subargs.has_captures;
|
|
args->is_fuzzy |= subargs.is_fuzzy;
|
|
|
|
++args->code;
|
|
|
|
/* Is it a repeat of something which will match a single character?
|
|
*
|
|
* If it's in a fuzzy section then it won't be optimised as a
|
|
* single-character repeat.
|
|
*/
|
|
if (sequence_matches_one(subargs.start)) {
|
|
repeat_node->op = greedy ? RE_OP_GREEDY_REPEAT_ONE :
|
|
RE_OP_LAZY_REPEAT_ONE;
|
|
|
|
/* Append the new sequence. */
|
|
add_node(args->end, repeat_node);
|
|
repeat_node->nonstring.next_2.node = subargs.start;
|
|
args->end = repeat_node;
|
|
} else {
|
|
RE_Node* end_repeat_node;
|
|
RE_Node* end_node;
|
|
|
|
end_repeat_node = create_node(args->pattern, greedy ?
|
|
RE_OP_END_GREEDY_REPEAT : RE_OP_END_LAZY_REPEAT, 0, args->forward
|
|
? 1 : -1, 4);
|
|
if (!end_repeat_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
end_repeat_node->values[0] = repeat_node->values[0];
|
|
end_repeat_node->values[1] = repeat_node->values[1];
|
|
end_repeat_node->values[2] = repeat_node->values[2];
|
|
end_repeat_node->values[3] = args->forward;
|
|
|
|
end_node = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0);
|
|
if (!end_node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
/* Append the new sequence. */
|
|
add_node(args->end, repeat_node);
|
|
add_node(repeat_node, subargs.start);
|
|
add_node(repeat_node, end_node);
|
|
add_node(subargs.end, end_repeat_node);
|
|
add_node(end_repeat_node, subargs.start);
|
|
add_node(end_repeat_node, end_node);
|
|
args->end = end_node;
|
|
}
|
|
}
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a STRING node. */
|
|
Py_LOCAL_INLINE(int) build_STRING(RE_CompileArgs* args, BOOL is_charset) {
|
|
RE_CODE flags;
|
|
RE_CODE length;
|
|
RE_UINT8 op;
|
|
Py_ssize_t step;
|
|
RE_Node* node;
|
|
size_t i;
|
|
|
|
/* codes: opcode, flags, length, characters. */
|
|
flags = args->code[1];
|
|
length = args->code[2];
|
|
if (args->code + 3 + length > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
op = (RE_UINT8)args->code[0];
|
|
|
|
step = get_step(op);
|
|
|
|
/* Create the node. */
|
|
node = create_node(args->pattern, op, flags, step * (Py_ssize_t)length,
|
|
length);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
if (!is_charset)
|
|
node->status |= RE_STATUS_STRING;
|
|
|
|
for (i = 0; i < length; i++)
|
|
node->values[i] = args->code[3 + i];
|
|
|
|
args->code += 3 + length;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
/* Because of full case-folding, one character in the text could match
|
|
* multiple characters in the pattern.
|
|
*/
|
|
if (op == RE_OP_STRING_FLD || op == RE_OP_STRING_FLD_REV)
|
|
args->min_width += possible_unfolded_length((Py_ssize_t)length);
|
|
else
|
|
args->min_width += (Py_ssize_t)length;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a SET node. */
|
|
Py_LOCAL_INLINE(int) build_SET(RE_CompileArgs* args) {
|
|
RE_UINT8 op;
|
|
RE_CODE flags;
|
|
Py_ssize_t step;
|
|
RE_Node* node;
|
|
Py_ssize_t saved_min_width;
|
|
int status;
|
|
|
|
/* codes: opcode, flags, members. */
|
|
op = (RE_UINT8)args->code[0];
|
|
flags = args->code[1];
|
|
|
|
step = get_step(op);
|
|
|
|
if (flags & RE_ZEROWIDTH_OP)
|
|
step = 0;
|
|
|
|
node = create_node(args->pattern, op, flags, step, 0);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
args->code += 2;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
saved_min_width = args->min_width;
|
|
|
|
/* Compile the character set. */
|
|
do {
|
|
switch (args->code[0]) {
|
|
case RE_OP_CHARACTER:
|
|
case RE_OP_PROPERTY:
|
|
status = build_CHARACTER_or_PROPERTY(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_RANGE:
|
|
status = build_RANGE(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_UNION:
|
|
status = build_SET(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_STRING:
|
|
/* A set of characters. */
|
|
if (!build_STRING(args, TRUE))
|
|
return FALSE;
|
|
break;
|
|
default:
|
|
/* Illegal opcode for a character set. */
|
|
return RE_ERROR_ILLEGAL;
|
|
}
|
|
} while (args->code < args->end_code && args->code[0] != RE_OP_END);
|
|
|
|
/* Check that we've reached the end correctly. (The last opcode should be
|
|
* 'END'.)
|
|
*/
|
|
if (args->code >= args->end_code || args->code[0] != RE_OP_END)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
++args->code;
|
|
|
|
/* At this point the set's members are in the main sequence. They need to
|
|
* be moved out-of-line.
|
|
*/
|
|
node->nonstring.next_2.node = node->next_1.node;
|
|
node->next_1.node = NULL;
|
|
args->end = node;
|
|
|
|
args->min_width = saved_min_width;
|
|
|
|
if (step != 0)
|
|
++args->min_width;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a STRING_SET node. */
|
|
Py_LOCAL_INLINE(int) build_STRING_SET(RE_CompileArgs* args) {
|
|
RE_CODE index;
|
|
RE_CODE min_len;
|
|
RE_CODE max_len;
|
|
RE_Node* node;
|
|
|
|
/* codes: opcode, index, min_len, max_len. */
|
|
if (args->code + 3 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
index = args->code[1];
|
|
min_len = args->code[2];
|
|
max_len = args->code[3];
|
|
node = create_node(args->pattern, (RE_UINT8)args->code[0], 0, 0, 3);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
node->values[0] = index;
|
|
node->values[1] = min_len;
|
|
node->values[2] = max_len;
|
|
|
|
args->code += 4;
|
|
|
|
/* Append the reference. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a SUCCESS node . */
|
|
Py_LOCAL_INLINE(int) build_SUCCESS(RE_CompileArgs* args) {
|
|
RE_Node* node;
|
|
/* code: opcode. */
|
|
|
|
/* Create the node. */
|
|
node = create_node(args->pattern, RE_OP_SUCCESS, 0, 0, 0);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
++args->code;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a zero-width node. */
|
|
Py_LOCAL_INLINE(int) build_zerowidth(RE_CompileArgs* args) {
|
|
RE_CODE flags;
|
|
RE_Node* node;
|
|
|
|
/* codes: opcode, flags. */
|
|
if (args->code + 1 > args->end_code)
|
|
return RE_ERROR_ILLEGAL;
|
|
|
|
flags = args->code[1];
|
|
|
|
/* Create the node. */
|
|
node = create_node(args->pattern, (RE_UINT8)args->code[0], flags, 0, 0);
|
|
if (!node)
|
|
return RE_ERROR_MEMORY;
|
|
|
|
args->code += 2;
|
|
|
|
/* Append the node. */
|
|
add_node(args->end, node);
|
|
args->end = node;
|
|
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
|
|
/* Builds a sequence of nodes from regular expression code. */
|
|
Py_LOCAL_INLINE(int) build_sequence(RE_CompileArgs* args) {
|
|
int status;
|
|
|
|
/* Guarantee that there's something to attach to. */
|
|
args->start = create_node(args->pattern, RE_OP_BRANCH, 0, 0, 0);
|
|
args->end = args->start;
|
|
|
|
/* The sequence should end with an opcode we don't understand. If it
|
|
* doesn't then the code is illegal.
|
|
*/
|
|
while (args->code < args->end_code) {
|
|
/* The following code groups opcodes by format, not function. */
|
|
switch (args->code[0]) {
|
|
case RE_OP_ANY:
|
|
case RE_OP_ANY_ALL:
|
|
case RE_OP_ANY_ALL_REV:
|
|
case RE_OP_ANY_REV:
|
|
case RE_OP_ANY_U:
|
|
case RE_OP_ANY_U_REV:
|
|
/* A simple opcode with no trailing codewords and width of 1. */
|
|
status = build_ANY(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_ATOMIC:
|
|
/* An atomic sequence. */
|
|
status = build_ATOMIC(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_BOUNDARY:
|
|
case RE_OP_DEFAULT_BOUNDARY:
|
|
case RE_OP_DEFAULT_END_OF_WORD:
|
|
case RE_OP_DEFAULT_START_OF_WORD:
|
|
case RE_OP_END_OF_WORD:
|
|
case RE_OP_GRAPHEME_BOUNDARY:
|
|
case RE_OP_START_OF_WORD:
|
|
/* A word or grapheme boundary. */
|
|
status = build_BOUNDARY(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_BRANCH:
|
|
/* A 2-way branch. */
|
|
status = build_BRANCH(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_CALL_REF:
|
|
/* A group call ref. */
|
|
status = build_CALL_REF(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_CHARACTER:
|
|
case RE_OP_CHARACTER_IGN:
|
|
case RE_OP_CHARACTER_IGN_REV:
|
|
case RE_OP_CHARACTER_REV:
|
|
case RE_OP_PROPERTY:
|
|
case RE_OP_PROPERTY_IGN:
|
|
case RE_OP_PROPERTY_IGN_REV:
|
|
case RE_OP_PROPERTY_REV:
|
|
/* A character literal or a property. */
|
|
status = build_CHARACTER_or_PROPERTY(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_END_OF_LINE:
|
|
case RE_OP_END_OF_LINE_U:
|
|
case RE_OP_END_OF_STRING:
|
|
case RE_OP_END_OF_STRING_LINE:
|
|
case RE_OP_END_OF_STRING_LINE_U:
|
|
case RE_OP_SEARCH_ANCHOR:
|
|
case RE_OP_START_OF_LINE:
|
|
case RE_OP_START_OF_LINE_U:
|
|
case RE_OP_START_OF_STRING:
|
|
/* A simple opcode with no trailing codewords and width of 0. */
|
|
status = build_zerowidth(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_FUZZY:
|
|
/* A fuzzy sequence. */
|
|
status = build_FUZZY(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_GREEDY_REPEAT:
|
|
case RE_OP_LAZY_REPEAT:
|
|
/* A repeated sequence. */
|
|
status = build_REPEAT(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_GROUP:
|
|
/* A capture group. */
|
|
status = build_GROUP(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_GROUP_CALL:
|
|
/* A group call. */
|
|
status = build_GROUP_CALL(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_GROUP_EXISTS:
|
|
/* A conditional sequence. */
|
|
status = build_GROUP_EXISTS(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_LOOKAROUND:
|
|
/* A lookaround. */
|
|
status = build_LOOKAROUND(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_RANGE:
|
|
case RE_OP_RANGE_IGN:
|
|
case RE_OP_RANGE_IGN_REV:
|
|
case RE_OP_RANGE_REV:
|
|
/* A range. */
|
|
status = build_RANGE(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_REF_GROUP:
|
|
case RE_OP_REF_GROUP_FLD:
|
|
case RE_OP_REF_GROUP_FLD_REV:
|
|
case RE_OP_REF_GROUP_IGN:
|
|
case RE_OP_REF_GROUP_IGN_REV:
|
|
case RE_OP_REF_GROUP_REV:
|
|
/* A reference to a group. */
|
|
status = build_REF_GROUP(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_SET_DIFF:
|
|
case RE_OP_SET_DIFF_IGN:
|
|
case RE_OP_SET_DIFF_IGN_REV:
|
|
case RE_OP_SET_DIFF_REV:
|
|
case RE_OP_SET_INTER:
|
|
case RE_OP_SET_INTER_IGN:
|
|
case RE_OP_SET_INTER_IGN_REV:
|
|
case RE_OP_SET_INTER_REV:
|
|
case RE_OP_SET_SYM_DIFF:
|
|
case RE_OP_SET_SYM_DIFF_IGN:
|
|
case RE_OP_SET_SYM_DIFF_IGN_REV:
|
|
case RE_OP_SET_SYM_DIFF_REV:
|
|
case RE_OP_SET_UNION:
|
|
case RE_OP_SET_UNION_IGN:
|
|
case RE_OP_SET_UNION_IGN_REV:
|
|
case RE_OP_SET_UNION_REV:
|
|
/* A set. */
|
|
status = build_SET(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_STRING:
|
|
case RE_OP_STRING_FLD:
|
|
case RE_OP_STRING_FLD_REV:
|
|
case RE_OP_STRING_IGN:
|
|
case RE_OP_STRING_IGN_REV:
|
|
case RE_OP_STRING_REV:
|
|
/* A string literal. */
|
|
if (!build_STRING(args, FALSE))
|
|
return FALSE;
|
|
break;
|
|
case RE_OP_STRING_SET:
|
|
case RE_OP_STRING_SET_FLD:
|
|
case RE_OP_STRING_SET_FLD_REV:
|
|
case RE_OP_STRING_SET_IGN:
|
|
case RE_OP_STRING_SET_IGN_REV:
|
|
case RE_OP_STRING_SET_REV:
|
|
/* A reference to a list. */
|
|
status = build_STRING_SET(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
case RE_OP_SUCCESS:
|
|
/* Success. */
|
|
status = build_SUCCESS(args);
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return status;
|
|
break;
|
|
default:
|
|
/* We've found an opcode which we don't recognise. We'll leave it
|
|
* for the caller.
|
|
*/
|
|
return RE_ERROR_SUCCESS;
|
|
}
|
|
}
|
|
|
|
/* If we're here then we should be at the end of the code, otherwise we
|
|
* have an error.
|
|
*/
|
|
return args->code == args->end_code;
|
|
}
|
|
|
|
/* Compiles the regular expression code to 'nodes'.
|
|
*
|
|
* Various details about the regular expression are discovered during
|
|
* compilation and stored in the PatternObject.
|
|
*/
|
|
Py_LOCAL_INLINE(BOOL) compile_to_nodes(RE_CODE* code, RE_CODE* end_code,
|
|
PatternObject* pattern) {
|
|
RE_CompileArgs args;
|
|
int status;
|
|
|
|
/* Compile a regex sequence and then check that we've reached the end
|
|
* correctly. (The last opcode should be 'SUCCESS'.)
|
|
*
|
|
* If successful, 'start' and 'end' will point to the start and end nodes
|
|
* of the compiled sequence.
|
|
*/
|
|
args.code = code;
|
|
args.end_code = end_code;
|
|
args.pattern = pattern;
|
|
args.forward = (pattern->flags & RE_FLAG_REVERSE) == 0;
|
|
args.min_width = 0;
|
|
args.visible_captures = FALSE;
|
|
args.has_captures = FALSE;
|
|
args.repeat_depth = 0;
|
|
args.is_fuzzy = FALSE;
|
|
args.within_fuzzy = FALSE;
|
|
status = build_sequence(&args);
|
|
if (status == RE_ERROR_ILLEGAL)
|
|
set_error(RE_ERROR_ILLEGAL, NULL);
|
|
|
|
if (status != RE_ERROR_SUCCESS)
|
|
return FALSE;
|
|
|
|
pattern->min_width = args.min_width;
|
|
pattern->is_fuzzy = args.is_fuzzy;
|
|
pattern->do_search_start = TRUE;
|
|
pattern->start_node = args.start;
|
|
|
|
/* Optimise the pattern. */
|
|
if (!optimise_pattern(pattern))
|
|
return FALSE;
|
|
|
|
pattern->start_test = locate_test_start(pattern->start_node);
|
|
|
|
/* Get the call_ref for the entire pattern, if any. */
|
|
if (pattern->start_node->op == RE_OP_CALL_REF)
|
|
pattern->pattern_call_ref = (Py_ssize_t)pattern->start_node->values[0];
|
|
else
|
|
pattern->pattern_call_ref = -1;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
/* Gets the required characters for a regex.
|
|
*
|
|
* In the event of an error, it just pretends that there are no required
|
|
* characters.
|
|
*/
|
|
Py_LOCAL_INLINE(void) get_required_chars(PyObject* required_chars, RE_CODE**
|
|
req_chars, size_t* req_length) {
|
|
Py_ssize_t len;
|
|
RE_CODE* chars;
|
|
Py_ssize_t i;
|
|
|
|
*req_chars = NULL;
|
|
*req_length = 0;
|
|
|
|
len = PyTuple_GET_SIZE(required_chars);
|
|
if (len < 1 || PyErr_Occurred()) {
|
|
PyErr_Clear();
|
|
return;
|
|
}
|
|
|
|
chars = (RE_CODE*)re_alloc((size_t)len * sizeof(RE_CODE));
|
|
if (!chars)
|
|
goto error;
|
|
|
|
for (i = 0; i < len; i++) {
|
|
PyObject* o;
|
|
size_t value;
|
|
|
|
/* PyTuple_SET_ITEM borrows the reference. */
|
|
o = PyTuple_GET_ITEM(required_chars, i);
|
|
|
|
value = PyLong_AsUnsignedLong(o);
|
|
if ((Py_ssize_t)value == -1 && PyErr_Occurred())
|
|
goto error;
|
|
|
|
chars[i] = (RE_CODE)value;
|
|
if (chars[i] != value)
|
|
goto error;
|
|
}
|
|
|
|
*req_chars = chars;
|
|
*req_length = (size_t)len;
|
|
|
|
return;
|
|
|
|
error:
|
|
PyErr_Clear();
|
|
re_dealloc(chars);
|
|
}
|
|
|
|
/* Makes a STRING node. */
|
|
Py_LOCAL_INLINE(RE_Node*) make_STRING_node(PatternObject* pattern, RE_UINT8 op,
|
|
size_t length, RE_CODE* chars) {
|
|
Py_ssize_t step;
|
|
RE_Node* node;
|
|
size_t i;
|
|
|
|
step = get_step(op);
|
|
|
|
/* Create the node. */
|
|
node = create_node(pattern, op, 0, step * (Py_ssize_t)length, length);
|
|
if (!node)
|
|
return NULL;
|
|
|
|
node->status |= RE_STATUS_STRING;
|
|
|
|
for (i = 0; i < length; i++)
|
|
node->values[i] = chars[i];
|
|
|
|
return node;
|
|
}
|
|
|
|
/* Compiles regular expression code to a PatternObject.
|
|
*
|
|
* The regular expression code is provided as a list and is then compiled to
|
|
* 'nodes'. Various details about the regular expression are discovered during
|
|
* compilation and stored in the PatternObject.
|
|
*/
|
|
static PyObject* re_compile(PyObject* self_, PyObject* args) {
|
|
PyObject* pattern;
|
|
Py_ssize_t flags = 0;
|
|
PyObject* code_list;
|
|
PyObject* groupindex;
|
|
PyObject* indexgroup;
|
|
PyObject* named_lists;
|
|
PyObject* named_list_indexes;
|
|
Py_ssize_t req_offset;
|
|
PyObject* required_chars;
|
|
size_t req_length;
|
|
RE_CODE* req_chars;
|
|
Py_ssize_t req_flags;
|
|
size_t public_group_count;
|
|
Py_ssize_t code_len;
|
|
RE_CODE* code;
|
|
Py_ssize_t i;
|
|
PatternObject* self;
|
|
BOOL ascii;
|
|
BOOL locale;
|
|
BOOL unicode;
|
|
BOOL ok;
|
|
|
|
if (!PyArg_ParseTuple(args, "OnOOOOOnOnn:re_compile", &pattern, &flags,
|
|
&code_list, &groupindex, &indexgroup, &named_lists, &named_list_indexes,
|
|
&req_offset, &required_chars, &req_flags, &public_group_count))
|
|
return NULL;
|
|
|
|
/* Read the regex code. */
|
|
code_len = PyList_GET_SIZE(code_list);
|
|
code = (RE_CODE*)re_alloc((size_t)code_len * sizeof(RE_CODE));
|
|
if (!code)
|
|
return NULL;
|
|
|
|
for (i = 0; i < code_len; i++) {
|
|
PyObject* o;
|
|
size_t value;
|
|
|
|
/* PyList_GET_ITEM borrows a reference. */
|
|
o = PyList_GET_ITEM(code_list, i);
|
|
|
|
value = PyLong_AsUnsignedLong(o);
|
|
if ((Py_ssize_t)value == -1 && PyErr_Occurred())
|
|
goto error;
|
|
|
|
code[i] = (RE_CODE)value;
|
|
if (code[i] != value)
|
|
goto error;
|
|
}
|
|
|
|
/* Get the required characters. */
|
|
get_required_chars(required_chars, &req_chars, &req_length);
|
|
|
|
/* Create the PatternObject. */
|
|
self = PyObject_NEW(PatternObject, &Pattern_Type);
|
|
if (!self) {
|
|
set_error(RE_ERROR_MEMORY, NULL);
|
|
re_dealloc(req_chars);
|
|
re_dealloc(code);
|
|
return NULL;
|
|
}
|
|
|
|
/* Initialise the PatternObject. */
|
|
self->pattern = pattern;
|
|
self->flags = flags;
|
|
self->weakreflist = NULL;
|
|
self->start_node = NULL;
|
|
self->repeat_count = 0;
|
|
self->true_group_count = 0;
|
|
self->public_group_count = public_group_count;
|
|
self->group_end_index = 0;
|
|
self->groupindex = groupindex;
|
|
self->indexgroup = indexgroup;
|
|
self->named_lists = named_lists;
|
|
self->named_lists_count = (size_t)PyDict_Size(named_lists);
|
|
self->partial_named_lists[0] = NULL;
|
|
self->partial_named_lists[1] = NULL;
|
|
self->named_list_indexes = named_list_indexes;
|
|
self->node_capacity = 0;
|
|
self->node_count = 0;
|
|
self->node_list = NULL;
|
|
self->group_info_capacity = 0;
|
|
self->group_info = NULL;
|
|
self->call_ref_info_capacity = 0;
|
|
self->call_ref_info_count = 0;
|
|
self->call_ref_info = NULL;
|
|
self->repeat_info_capacity = 0;
|
|
self->repeat_info = NULL;
|
|
self->groups_storage = NULL;
|
|
self->repeats_storage = NULL;
|
|
self->fuzzy_count = 0;
|
|
self->recursive = FALSE;
|
|
self->req_offset = req_offset;
|
|
self->req_string = NULL;
|
|
Py_INCREF(self->pattern);
|
|
Py_INCREF(self->groupindex);
|
|
Py_INCREF(self->indexgroup);
|
|
Py_INCREF(self->named_lists);
|
|
Py_INCREF(self->named_list_indexes);
|
|
|
|
/* Initialise the character encoding. */
|
|
unicode = (flags & RE_FLAG_UNICODE) != 0;
|
|
locale = (flags & RE_FLAG_LOCALE) != 0;
|
|
ascii = (flags & RE_FLAG_ASCII) != 0;
|
|
if (!unicode && !locale && !ascii) {
|
|
if (PyString_Check(self->pattern))
|
|
ascii = RE_FLAG_ASCII;
|
|
else
|
|
unicode = RE_FLAG_UNICODE;
|
|
}
|
|
if (unicode)
|
|
self->encoding = &unicode_encoding;
|
|
else if (locale)
|
|
self->encoding = &locale_encoding;
|
|
else if (ascii)
|
|
self->encoding = &ascii_encoding;
|
|
|
|
/* Compile the regular expression code to nodes. */
|
|
ok = compile_to_nodes(code, code + code_len, self);
|
|
|
|
/* We no longer need the regular expression code. */
|
|
re_dealloc(code);
|
|
|
|
if (!ok) {
|
|
Py_DECREF(self);
|
|
re_dealloc(req_chars);
|
|
return NULL;
|
|
}
|
|
|
|
/* Make a node for the required string, if there's one. */
|
|
if (req_chars) {
|
|
/* Remove the FULLCASE flag if it's not a Unicode pattern. */
|
|
if (!(self->flags & RE_FLAG_UNICODE))
|
|
req_flags &= ~RE_FLAG_FULLCASE;
|
|
|
|
if (self->flags & RE_FLAG_REVERSE) {
|
|
switch (req_flags) {
|
|
case 0:
|
|
self->req_string = make_STRING_node(self, RE_OP_STRING_REV,
|
|
req_length, req_chars);
|
|
break;
|
|
case RE_FLAG_IGNORECASE | RE_FLAG_FULLCASE:
|
|
self->req_string = make_STRING_node(self, RE_OP_STRING_FLD_REV,
|
|
req_length, req_chars);
|
|
break;
|
|
case RE_FLAG_IGNORECASE:
|
|
self->req_string = make_STRING_node(self, RE_OP_STRING_IGN_REV,
|
|
req_length, req_chars);
|
|
break;
|
|
}
|
|
} else {
|
|
switch (req_flags) {
|
|
case 0:
|
|
self->req_string = make_STRING_node(self, RE_OP_STRING,
|
|
req_length, req_chars);
|
|
break;
|
|
case RE_FLAG_IGNORECASE | RE_FLAG_FULLCASE:
|
|
self->req_string = make_STRING_node(self, RE_OP_STRING_FLD,
|
|
req_length, req_chars);
|
|
break;
|
|
case RE_FLAG_IGNORECASE:
|
|
self->req_string = make_STRING_node(self, RE_OP_STRING_IGN,
|
|
req_length, req_chars);
|
|
break;
|
|
}
|
|
}
|
|
|
|
re_dealloc(req_chars);
|
|
}
|
|
|
|
return (PyObject*)self;
|
|
|
|
error:
|
|
re_dealloc(code);
|
|
set_error(RE_ERROR_ILLEGAL, NULL);
|
|
return NULL;
|
|
}
|
|
|
|
/* Gets the size of the codewords. */
|
|
static PyObject* get_code_size(PyObject* self, PyObject* unused) {
|
|
return Py_BuildValue("n", sizeof(RE_CODE));
|
|
}
|
|
|
|
/* Gets the property dict. */
|
|
static PyObject* get_properties(PyObject* self_, PyObject* args) {
|
|
Py_INCREF(property_dict);
|
|
|
|
return property_dict;
|
|
}
|
|
|
|
/* Folds the case of a string. */
|
|
static PyObject* fold_case(PyObject* self_, PyObject* args) {
|
|
RE_StringInfo str_info;
|
|
Py_UCS4 (*char_at)(void* text, Py_ssize_t pos);
|
|
Py_ssize_t folded_charsize;
|
|
void (*set_char_at)(void* text, Py_ssize_t pos, Py_UCS4 ch);
|
|
RE_EncodingTable* encoding;
|
|
Py_ssize_t buf_size;
|
|
void* folded;
|
|
Py_ssize_t folded_len;
|
|
PyObject* result;
|
|
|
|
Py_ssize_t flags;
|
|
PyObject* string;
|
|
if (!PyArg_ParseTuple(args, "nO:fold_case", &flags, &string))
|
|
return NULL;
|
|
|
|
if (!(flags & RE_FLAG_IGNORECASE)) {
|
|
Py_INCREF(string);
|
|
return string;
|
|
}
|
|
|
|
/* Get the string. */
|
|
if (!get_string(string, &str_info))
|
|
return NULL;
|
|
|
|
/* Get the function for reading from the original string. */
|
|
switch (str_info.charsize) {
|
|
case 1:
|
|
char_at = bytes1_char_at;
|
|
break;
|
|
case 2:
|
|
char_at = bytes2_char_at;
|
|
break;
|
|
case 4:
|
|
char_at = bytes4_char_at;
|
|
break;
|
|
default:
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return NULL;
|
|
}
|
|
|
|
/* What's the encoding? */
|
|
if (flags & RE_FLAG_UNICODE)
|
|
encoding = &unicode_encoding;
|
|
else if (flags & RE_FLAG_LOCALE)
|
|
encoding = &locale_encoding;
|
|
else if (flags & RE_FLAG_ASCII)
|
|
encoding = &ascii_encoding;
|
|
else
|
|
encoding = &unicode_encoding;
|
|
|
|
/* The folded string will have the same width as the original string. */
|
|
folded_charsize = str_info.charsize;
|
|
|
|
/* Get the function for writing to the folded string. */
|
|
switch (folded_charsize) {
|
|
case 1:
|
|
set_char_at = bytes1_set_char_at;
|
|
break;
|
|
case 2:
|
|
set_char_at = bytes2_set_char_at;
|
|
break;
|
|
case 4:
|
|
set_char_at = bytes4_set_char_at;
|
|
break;
|
|
default:
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return NULL;
|
|
}
|
|
|
|
/* Allocate a buffer for the folded string. */
|
|
if (flags & RE_FLAG_FULLCASE)
|
|
/* When using full case-folding with Unicode, some single codepoints
|
|
* are mapped to multiple codepoints.
|
|
*/
|
|
buf_size = str_info.length * RE_MAX_FOLDED;
|
|
else
|
|
buf_size = str_info.length;
|
|
|
|
folded = re_alloc((size_t)(buf_size * folded_charsize));
|
|
if (!folded) {
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return NULL;
|
|
}
|
|
|
|
/* Fold the case of the string. */
|
|
folded_len = 0;
|
|
|
|
if (flags & RE_FLAG_FULLCASE) {
|
|
/* Full case-folding. */
|
|
int (*full_case_fold)(Py_UCS4 ch, Py_UCS4* folded);
|
|
Py_ssize_t i;
|
|
Py_UCS4 codepoints[RE_MAX_FOLDED];
|
|
|
|
full_case_fold = encoding->full_case_fold;
|
|
|
|
for (i = 0; i < str_info.length; i++) {
|
|
int count;
|
|
int j;
|
|
|
|
count = full_case_fold(char_at(str_info.characters, i),
|
|
codepoints);
|
|
for (j = 0; j < count; j++)
|
|
set_char_at(folded, folded_len + j, codepoints[j]);
|
|
|
|
folded_len += count;
|
|
}
|
|
} else {
|
|
/* Simple case-folding. */
|
|
Py_UCS4 (*simple_case_fold)(Py_UCS4 ch);
|
|
Py_ssize_t i;
|
|
|
|
simple_case_fold = encoding->simple_case_fold;
|
|
|
|
for (i = 0; i < str_info.length; i++) {
|
|
Py_UCS4 ch;
|
|
|
|
ch = simple_case_fold(char_at(str_info.characters, i));
|
|
set_char_at(folded, i, ch);
|
|
}
|
|
|
|
folded_len = str_info.length;
|
|
}
|
|
|
|
/* Build the result string. */
|
|
if (str_info.is_unicode)
|
|
result = build_unicode_value(folded, folded_len, folded_charsize);
|
|
else
|
|
result = build_bytes_value(folded, folded_len, folded_charsize);
|
|
|
|
re_dealloc(folded);
|
|
|
|
#if PY_VERSION_HEX >= 0x02060000
|
|
/* Release the original string's buffer. */
|
|
release_buffer(&str_info);
|
|
|
|
#endif
|
|
return result;
|
|
}
|
|
|
|
/* Returns a tuple of the Unicode characters that expand on full case-folding.
|
|
*/
|
|
static PyObject* get_expand_on_folding(PyObject* self, PyObject* unused) {
|
|
int count;
|
|
int i;
|
|
PyObject* result;
|
|
|
|
/* How many characters are there? */
|
|
count = sizeof(re_expand_on_folding) / sizeof(re_expand_on_folding[0]);
|
|
|
|
/* Put all the characters in a tuple. */
|
|
result = PyTuple_New(count);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
for (i = 0; i < count; i++) {
|
|
Py_UNICODE codepoint;
|
|
PyObject* item;
|
|
|
|
codepoint = re_expand_on_folding[i];
|
|
|
|
item = build_unicode_value(&codepoint, 1, sizeof(codepoint));
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyTuple_SetItem borrows the reference. */
|
|
PyTuple_SetItem(result, i, item);
|
|
}
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* Returns whether a character has a given value for a Unicode property. */
|
|
static PyObject* has_property_value(PyObject* self_, PyObject* args) {
|
|
BOOL v;
|
|
|
|
Py_ssize_t property_value;
|
|
Py_ssize_t character;
|
|
if (!PyArg_ParseTuple(args, "nn:has_property_value", &property_value,
|
|
&character))
|
|
return NULL;
|
|
|
|
v = unicode_has_property((RE_CODE)property_value, (Py_UCS4)character) ? 1 :
|
|
0;
|
|
|
|
return Py_BuildValue("n", v);
|
|
}
|
|
|
|
/* Returns a list of all the simple cases of a character.
|
|
*
|
|
* If full case-folding is turned on and the character also expands on full
|
|
* case-folding, a None is appended to the list.
|
|
*/
|
|
static PyObject* get_all_cases(PyObject* self_, PyObject* args) {
|
|
RE_EncodingTable* encoding;
|
|
int count;
|
|
Py_UCS4 cases[RE_MAX_CASES];
|
|
Py_UCS4 folded[RE_MAX_FOLDED];
|
|
PyObject* result;
|
|
int i;
|
|
|
|
Py_ssize_t flags;
|
|
Py_ssize_t character;
|
|
if (!PyArg_ParseTuple(args, "nn:get_all_cases", &flags, &character))
|
|
return NULL;
|
|
|
|
/* What's the encoding? */
|
|
if (flags & RE_FLAG_UNICODE)
|
|
encoding = &unicode_encoding;
|
|
else if (flags & RE_FLAG_LOCALE)
|
|
encoding = &locale_encoding;
|
|
else if (flags & RE_FLAG_ASCII)
|
|
encoding = &ascii_encoding;
|
|
else
|
|
encoding = &ascii_encoding;
|
|
|
|
/* Get all the simple cases. */
|
|
count = encoding->all_cases((Py_UCS4)character, cases);
|
|
|
|
result = PyList_New(count);
|
|
if (!result)
|
|
return NULL;
|
|
|
|
for (i = 0; i < count; i++) {
|
|
PyObject* item;
|
|
|
|
item = Py_BuildValue("n", cases[i]);
|
|
if (!item)
|
|
goto error;
|
|
|
|
/* PyList_SetItem borrows the reference. */
|
|
PyList_SetItem(result, i, item);
|
|
}
|
|
|
|
/* If the character also expands on full case-folding, append a None. */
|
|
if ((flags & RE_FULL_CASE_FOLDING) == RE_FULL_CASE_FOLDING) {
|
|
count = encoding->full_case_fold((Py_UCS4)character, folded);
|
|
if (count > 1)
|
|
PyList_Append(result, Py_None);
|
|
}
|
|
|
|
return result;
|
|
|
|
error:
|
|
Py_DECREF(result);
|
|
return NULL;
|
|
}
|
|
|
|
/* The table of the module's functions. */
|
|
static PyMethodDef _functions[] = {
|
|
{"compile", (PyCFunction)re_compile, METH_VARARGS},
|
|
{"get_code_size", (PyCFunction)get_code_size, METH_NOARGS},
|
|
{"get_properties", (PyCFunction)get_properties, METH_VARARGS},
|
|
{"fold_case", (PyCFunction)fold_case, METH_VARARGS},
|
|
{"get_expand_on_folding", (PyCFunction)get_expand_on_folding, METH_NOARGS},
|
|
{"has_property_value", (PyCFunction)has_property_value, METH_VARARGS},
|
|
{"get_all_cases", (PyCFunction)get_all_cases, METH_VARARGS},
|
|
{NULL, NULL}
|
|
};
|
|
|
|
/* Initialises the property dictionary. */
|
|
Py_LOCAL_INLINE(BOOL) init_property_dict(void) {
|
|
size_t value_set_count;
|
|
size_t i;
|
|
PyObject** value_dicts;
|
|
|
|
property_dict = NULL;
|
|
|
|
/* How many value sets are there? */
|
|
value_set_count = 0;
|
|
|
|
for (i = 0; i < sizeof(re_property_values) / sizeof(re_property_values[0]);
|
|
i++) {
|
|
RE_PropertyValue* value;
|
|
|
|
value = &re_property_values[i];
|
|
if (value->value_set >= value_set_count)
|
|
value_set_count = (size_t)value->value_set + 1;
|
|
}
|
|
|
|
/* Quick references for the value sets. */
|
|
value_dicts = (PyObject**)re_alloc(value_set_count *
|
|
sizeof(value_dicts[0]));
|
|
if (!value_dicts)
|
|
return FALSE;
|
|
|
|
memset(value_dicts, 0, value_set_count * sizeof(value_dicts[0]));
|
|
|
|
/* Build the property values dictionaries. */
|
|
for (i = 0; i < sizeof(re_property_values) / sizeof(re_property_values[0]);
|
|
i++) {
|
|
RE_PropertyValue* value;
|
|
PyObject* v;
|
|
int status;
|
|
|
|
value = &re_property_values[i];
|
|
if (!value_dicts[value->value_set]) {
|
|
value_dicts[value->value_set] = PyDict_New();
|
|
if (!value_dicts[value->value_set])
|
|
goto error;
|
|
}
|
|
|
|
v = Py_BuildValue("i", value->id);
|
|
if (!v)
|
|
goto error;
|
|
|
|
status = PyDict_SetItemString(value_dicts[value->value_set],
|
|
re_strings[value->name], v);
|
|
Py_DECREF(v);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
|
|
/* Build the property dictionary. */
|
|
property_dict = PyDict_New();
|
|
if (!property_dict)
|
|
goto error;
|
|
|
|
for (i = 0; i < sizeof(re_properties) / sizeof(re_properties[0]); i++) {
|
|
RE_Property* property;
|
|
PyObject* v;
|
|
int status;
|
|
|
|
property = &re_properties[i];
|
|
v = Py_BuildValue("iO", property->id,
|
|
value_dicts[property->value_set]);
|
|
if (!v)
|
|
goto error;
|
|
|
|
status = PyDict_SetItemString(property_dict,
|
|
re_strings[property->name], v);
|
|
Py_DECREF(v);
|
|
if (status < 0)
|
|
goto error;
|
|
}
|
|
|
|
/* DECREF the value sets. Any unused ones will be deallocated. */
|
|
for (i = 0; i < value_set_count; i++)
|
|
Py_XDECREF(value_dicts[i]);
|
|
|
|
re_dealloc(value_dicts);
|
|
|
|
return TRUE;
|
|
|
|
error:
|
|
Py_XDECREF(property_dict);
|
|
|
|
/* DECREF the value sets. */
|
|
for (i = 0; i < value_set_count; i++)
|
|
Py_XDECREF(value_dicts[i]);
|
|
|
|
re_dealloc(value_dicts);
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
/* Initialises the module. */
|
|
PyMODINIT_FUNC init_regex(void) {
|
|
PyObject* m;
|
|
PyObject* d;
|
|
PyObject* x;
|
|
|
|
#if defined(VERBOSE)
|
|
/* Unbuffered in case it crashes! */
|
|
setvbuf(stdout, NULL, _IONBF, 0);
|
|
|
|
#endif
|
|
/* Initialise Pattern_Type. */
|
|
Pattern_Type.tp_dealloc = pattern_dealloc;
|
|
Pattern_Type.tp_repr = pattern_repr;
|
|
Pattern_Type.tp_flags = Py_TPFLAGS_HAVE_WEAKREFS;
|
|
Pattern_Type.tp_doc = pattern_doc;
|
|
Pattern_Type.tp_weaklistoffset = offsetof(PatternObject, weakreflist);
|
|
Pattern_Type.tp_methods = pattern_methods;
|
|
Pattern_Type.tp_members = pattern_members;
|
|
Pattern_Type.tp_getset = pattern_getset;
|
|
|
|
/* Initialise Match_Type. */
|
|
Match_Type.tp_dealloc = match_dealloc;
|
|
Match_Type.tp_repr = match_repr;
|
|
Match_Type.tp_as_mapping = &match_as_mapping;
|
|
Match_Type.tp_flags = Py_TPFLAGS_DEFAULT;
|
|
Match_Type.tp_doc = match_doc;
|
|
Match_Type.tp_methods = match_methods;
|
|
Match_Type.tp_members = match_members;
|
|
Match_Type.tp_getset = match_getset;
|
|
|
|
/* Initialise Scanner_Type. */
|
|
Scanner_Type.tp_dealloc = scanner_dealloc;
|
|
Scanner_Type.tp_flags = Py_TPFLAGS_DEFAULT;
|
|
Scanner_Type.tp_doc = scanner_doc;
|
|
Scanner_Type.tp_iter = scanner_iter;
|
|
Scanner_Type.tp_iternext = scanner_iternext;
|
|
Scanner_Type.tp_methods = scanner_methods;
|
|
Scanner_Type.tp_members = scanner_members;
|
|
|
|
/* Initialise Splitter_Type. */
|
|
Splitter_Type.tp_dealloc = splitter_dealloc;
|
|
Splitter_Type.tp_flags = Py_TPFLAGS_DEFAULT;
|
|
Splitter_Type.tp_doc = splitter_doc;
|
|
Splitter_Type.tp_iter = splitter_iter;
|
|
Splitter_Type.tp_iternext = splitter_iternext;
|
|
Splitter_Type.tp_methods = splitter_methods;
|
|
Splitter_Type.tp_members = splitter_members;
|
|
|
|
/* Initialize object types */
|
|
if (PyType_Ready(&Pattern_Type) < 0)
|
|
return;
|
|
if (PyType_Ready(&Match_Type) < 0)
|
|
return;
|
|
if (PyType_Ready(&Scanner_Type) < 0)
|
|
return;
|
|
if (PyType_Ready(&Splitter_Type) < 0)
|
|
return;
|
|
|
|
error_exception = NULL;
|
|
|
|
m = Py_InitModule("_" RE_MODULE, _functions);
|
|
if (!m)
|
|
return;
|
|
|
|
d = PyModule_GetDict(m);
|
|
|
|
x = PyInt_FromLong(RE_MAGIC);
|
|
if (x) {
|
|
PyDict_SetItemString(d, "MAGIC", x);
|
|
Py_DECREF(x);
|
|
}
|
|
|
|
x = PyInt_FromLong(sizeof(RE_CODE));
|
|
if (x) {
|
|
PyDict_SetItemString(d, "CODE_SIZE", x);
|
|
Py_DECREF(x);
|
|
}
|
|
|
|
x = PyString_FromString(copyright);
|
|
if (x) {
|
|
PyDict_SetItemString(d, "copyright", x);
|
|
Py_DECREF(x);
|
|
}
|
|
|
|
/* Initialise the property dictionary. */
|
|
if (!init_property_dict())
|
|
return;
|
|
}
|
|
|
|
/* vim:ts=4:sw=4:et */
|