322 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			322 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| /*
 | |
|  *
 | |
|  * Copyright (c) 1998-2002
 | |
|  * John Maddock
 | |
|  *
 | |
|  * Use, modification and distribution are subject to the 
 | |
|  * Boost Software License, Version 1.0. (See accompanying file 
 | |
|  * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 | |
|  *
 | |
|  */
 | |
| 
 | |
 /*
  *   LOCATION:    see http://www.boost.org for most recent version.
  *   FILE         states.hpp
  *   VERSION      see <boost/version.hpp>
  *   DESCRIPTION: Declares internal state machine structures.
  */
 | |
| 
 | |
| #ifndef BOOST_REGEX_V4_STATES_HPP
 | |
| #define BOOST_REGEX_V4_STATES_HPP
 | |
| 
 | |
| #ifdef BOOST_MSVC
 | |
| #pragma warning(push)
 | |
| #pragma warning(disable: 4103)
 | |
| #endif
 | |
| #ifdef BOOST_HAS_ABI_HEADERS
 | |
| #  include BOOST_ABI_PREFIX
 | |
| #endif
 | |
| #ifdef BOOST_MSVC
 | |
| #pragma warning(pop)
 | |
| #endif
 | |
| 
 | |
| namespace boost{
 | |
| namespace BOOST_REGEX_DETAIL_NS{
 | |
| 
 | |
/*** mask_type *******************************************************
Whenever we have a choice of two alternatives, we use an array of bytes
to indicate which of the two alternatives it is possible to take for any
given input character.  If mask_take is set, then we can take the next
state, and if mask_skip is set then we can take the alternative.
***********************************************************************/
enum mask_type
{
   mask_take = 1,                     // the next state can be taken
   mask_skip = 2,                     // the alternative can be taken
   mask_init = 4,                     // NOTE(review): purpose not visible in this header - confirm at the point of use
   mask_any = mask_skip | mask_take,  // either of the two paths can be taken (value 3)
   mask_all = mask_any                // same value as mask_any; does not include mask_init
};
 | |
| 
 | |
/*** helpers **********************************************************
These helpers let us use function overload resolution to detect whether
we have narrow or wide character strings:
is_byte<charT>::width_type is _narrow_type for char, unsigned char and
signed char, and _wide_type for every other character type.
***********************************************************************/
struct _narrow_type{};   // tag: single-byte character type
struct _wide_type{};     // tag: any other character type

// Primary template: anything that is not explicitly specialized below is "wide".
template <class charT> struct is_byte               { typedef _wide_type width_type; };
// The three one-byte character types are "narrow".
template<>             struct is_byte<char>         { typedef _narrow_type width_type; };
template<>             struct is_byte<unsigned char>{ typedef _narrow_type width_type; };
template<>             struct is_byte<signed char>  { typedef _narrow_type width_type; };
 | |
| 
 | |
/*** enum syntax_element_type ******************************************
Every record in the state machine falls into one of the following types.
The enumerators are numbered consecutively starting from zero; each one
is defined as "previous + 1" so the ordering is explicit.
***********************************************************************/
enum syntax_element_type
{
   // start of a marked sub-expression, or perl-style (?...) extension
   syntax_element_startmark = 0,
   // end of a marked sub-expression, or perl-style (?...) extension
   syntax_element_endmark = syntax_element_startmark + 1,
   // any sequence of literal characters
   syntax_element_literal = syntax_element_endmark + 1,
   // start of line assertion: ^
   syntax_element_start_line = syntax_element_literal + 1,
   // end of line assertion $
   syntax_element_end_line = syntax_element_start_line + 1,
   // match any character: .
   syntax_element_wild = syntax_element_end_line + 1,
   // end of expression: we have a match when we get here
   syntax_element_match = syntax_element_wild + 1,
   // perl style word boundary: \b
   syntax_element_word_boundary = syntax_element_match + 1,
   // perl style within word boundary: \B
   syntax_element_within_word = syntax_element_word_boundary + 1,
   // start of word assertion: \<
   syntax_element_word_start = syntax_element_within_word + 1,
   // end of word assertion: \>
   syntax_element_word_end = syntax_element_word_start + 1,
   // start of buffer assertion: \`
   syntax_element_buffer_start = syntax_element_word_end + 1,
   // end of buffer assertion: \'
   syntax_element_buffer_end = syntax_element_buffer_start + 1,
   // backreference to previously matched sub-expression
   syntax_element_backref = syntax_element_buffer_end + 1,
   // either a wide character set [..] or one with multicharacter collating elements:
   syntax_element_long_set = syntax_element_backref + 1,
   // narrow character set: [...]
   syntax_element_set = syntax_element_long_set + 1,
   // jump to a new state in the machine:
   syntax_element_jump = syntax_element_set + 1,
   // choose between two production states:
   syntax_element_alt = syntax_element_jump + 1,
   // a repeat
   syntax_element_rep = syntax_element_alt + 1,
   // match a combining character sequence
   syntax_element_combining = syntax_element_rep + 1,
   // perl style soft buffer end: \z
   syntax_element_soft_buffer_end = syntax_element_combining + 1,
   // perl style continuation: \G
   syntax_element_restart_continue = syntax_element_soft_buffer_end + 1,
   // single character repeats:
   syntax_element_dot_rep = syntax_element_restart_continue + 1,
   syntax_element_char_rep = syntax_element_dot_rep + 1,
   syntax_element_short_set_rep = syntax_element_char_rep + 1,
   syntax_element_long_set_rep = syntax_element_short_set_rep + 1,
   // a backstep for lookbehind repeats:
   syntax_element_backstep = syntax_element_long_set_rep + 1,
   // an assertion that a mark was matched:
   syntax_element_assert_backref = syntax_element_backstep + 1,
   // NOTE(review): presumably the state type used with re_case (change of
   // case sensitivity) - confirm against the matcher
   syntax_element_toggle_case = syntax_element_assert_backref + 1,
   // a recursive expression:
   syntax_element_recurse = syntax_element_toggle_case + 1,
   // Verbs:
   syntax_element_fail = syntax_element_recurse + 1,
   syntax_element_accept = syntax_element_fail + 1,
   syntax_element_commit = syntax_element_accept + 1,
   syntax_element_then = syntax_element_commit + 1
};
 | |
| 
 | |
#ifdef BOOST_REGEX_DEBUG
// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
// Declaration only (debug builds); no definition is provided in this header.
std::ostream& operator<<(std::ostream&, syntax_element_type);
#endif
 | |
| 
 | |
// Forward declaration: offset_type needs a pointer to the state base class,
// which is defined further down.
struct re_syntax_base;

/*** union offset_type ************************************************
Points to another state in the machine.  During machine construction
we use integral offsets, but these are converted to pointers before
execution of the machine.
***********************************************************************/
union offset_type
{
   re_syntax_base*   p;   // pointer form, valid once the machine has been fixed up for execution
   std::ptrdiff_t    i;   // integral offset form, used while the machine is being constructed
};
 | |
| 
 | |
| /*** struct re_syntax_base ********************************************
 | |
| Base class for all states in the machine.
 | |
| ***********************************************************************/
 | |
| struct re_syntax_base
 | |
| {
 | |
|    syntax_element_type   type;         // what kind of state this is
 | |
|    offset_type           next;         // next state in the machine
 | |
| };
 | |
| 
 | |
| /*** struct re_brace **************************************************
 | |
| A marked parenthesis.
 | |
| ***********************************************************************/
 | |
| struct re_brace : public re_syntax_base
 | |
| {
 | |
|    // The index to match, can be zero (don't mark the sub-expression)
 | |
|    // or negative (for perl style (?...) extentions):
 | |
|    int index;
 | |
|    bool icase;
 | |
| };
 | |
| 
 | |
| /*** struct re_dot **************************************************
 | |
| Match anything.
 | |
| ***********************************************************************/
 | |
| enum
 | |
| {
 | |
|    dont_care = 1,
 | |
|    force_not_newline = 0,
 | |
|    force_newline = 2,
 | |
| 
 | |
|    test_not_newline = 2,
 | |
|    test_newline = 3
 | |
| };
 | |
| struct re_dot : public re_syntax_base
 | |
| {
 | |
|    unsigned char mask;
 | |
| };
 | |
| 
 | |
| /*** struct re_literal ************************************************
 | |
| A string of literals, following this structure will be an 
 | |
| array of characters: charT[length]
 | |
| ***********************************************************************/
 | |
| struct re_literal : public re_syntax_base
 | |
| {
 | |
|    unsigned int length;
 | |
| };
 | |
| 
 | |
| /*** struct re_case ************************************************
 | |
| Indicates whether we are moving to a case insensive block or not
 | |
| ***********************************************************************/
 | |
| struct re_case : public re_syntax_base
 | |
| {
 | |
|    bool icase;
 | |
| };
 | |
| 
 | |
| /*** struct re_set_long ***********************************************
 | |
| A wide character set of characters, following this structure will be
 | |
| an array of type charT:
 | |
| First csingles null-terminated strings
 | |
| Then 2 * cranges NULL terminated strings
 | |
| Then cequivalents NULL terminated strings
 | |
| ***********************************************************************/
 | |
| template <class mask_type>
 | |
| struct re_set_long : public re_syntax_base
 | |
| {
 | |
|    unsigned int            csingles, cranges, cequivalents;
 | |
|    mask_type               cclasses;
 | |
|    mask_type               cnclasses;
 | |
|    bool                    isnot;
 | |
|    bool                    singleton;
 | |
| };
 | |
| 
 | |
| /*** struct re_set ****************************************************
 | |
| A set of narrow-characters, matches any of _map which is none-zero
 | |
| ***********************************************************************/
 | |
| struct re_set : public re_syntax_base
 | |
| {
 | |
|    unsigned char _map[1 << CHAR_BIT];
 | |
| };
 | |
| 
 | |
| /*** struct re_jump ***************************************************
 | |
| Jump to a new location in the machine (not next).
 | |
| ***********************************************************************/
 | |
| struct re_jump : public re_syntax_base
 | |
| {
 | |
|    offset_type     alt;                 // location to jump to
 | |
| };
 | |
| 
 | |
| /*** struct re_alt ***************************************************
 | |
| Jump to a new location in the machine (possibly next).
 | |
| ***********************************************************************/
 | |
| struct re_alt : public re_jump
 | |
| {
 | |
|    unsigned char   _map[1 << CHAR_BIT]; // which characters can take the jump
 | |
|    unsigned int    can_be_null;         // true if we match a NULL string
 | |
| };
 | |
| 
 | |
| /*** struct re_repeat *************************************************
 | |
| Repeat a section of the machine
 | |
| ***********************************************************************/
 | |
| struct re_repeat : public re_alt
 | |
| {
 | |
|    std::size_t   min, max;  // min and max allowable repeats
 | |
|    int           state_id;        // Unique identifier for this repeat
 | |
|    bool          leading;   // True if this repeat is at the start of the machine (lets us optimize some searches)
 | |
|    bool          greedy;    // True if this is a greedy repeat
 | |
| };
 | |
| 
 | |
| /*** struct re_recurse ************************************************
 | |
| Recurse to a particular subexpression.
 | |
| **********************************************************************/
 | |
| struct re_recurse : public re_jump
 | |
| {
 | |
|    int state_id;             // identifier of first nested repeat within the recursion.
 | |
| };
 | |
| 
 | |
| /*** struct re_commit *************************************************
 | |
| Used for the PRUNE, SKIP and COMMIT verbs which basically differ only in what happens
 | |
| if no match is found and we start searching forward.
 | |
| **********************************************************************/
 | |
| enum commit_type
 | |
| {
 | |
|    commit_prune,
 | |
|    commit_skip,
 | |
|    commit_commit
 | |
| };
 | |
| struct re_commit : public re_syntax_base
 | |
| {
 | |
|    commit_type action;
 | |
| };
 | |
| 
 | |
| /*** enum re_jump_size_type *******************************************
 | |
| Provides compiled size of re_jump structure (allowing for trailing alignment).
 | |
| We provide this so we know how manybytes to insert when constructing the machine
 | |
| (The value of padding_mask is defined in regex_raw_buffer.hpp).
 | |
| ***********************************************************************/
 | |
| enum re_jump_size_type
 | |
| {
 | |
|    re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
 | |
|    re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
 | |
|    re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
 | |
| };
 | |
| 
 | |
| /*** proc re_is_set_member *********************************************
 | |
| Forward declaration: we'll need this one later...
 | |
| ***********************************************************************/
 | |
| 
 | |
| template<class charT, class traits>
 | |
| struct regex_data;
 | |
| 
 | |
| template <class iterator, class charT, class traits_type, class char_classT>
 | |
| iterator BOOST_REGEX_CALL re_is_set_member(iterator next, 
 | |
|                           iterator last, 
 | |
|                           const re_set_long<char_classT>* set_, 
 | |
|                           const regex_data<charT, traits_type>& e, bool icase);
 | |
| 
 | |
| } // namespace BOOST_REGEX_DETAIL_NS
 | |
| 
 | |
| } // namespace boost
 | |
| 
 | |
| #ifdef BOOST_MSVC
 | |
| #pragma warning(push)
 | |
| #pragma warning(disable: 4103)
 | |
| #endif
 | |
| #ifdef BOOST_HAS_ABI_HEADERS
 | |
| #  include BOOST_ABI_SUFFIX
 | |
| #endif
 | |
| #ifdef BOOST_MSVC
 | |
| #pragma warning(pop)
 | |
| #endif
 | |
| 
 | |
| #endif
 | |
| 
 | |
| 
 | 
