/*
 * Copyright (C) 1992 by Software Research Associates, Inc.
 *      Author: Y. Kawabe <kawabe@sra.co.jp>
 *
 * Permission to use, copy, modify, and distribute, and sell this software
 * and its documentation for any purpose and without fee is hereby granted,
 * provided that the above copyright notice appear in all copies and that
 * both that copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Software Research Associates not be
 * used in advertising or publicity pertaining to distribution of the software
 * without specific, written prior permission.  Software Research Associates
 * makes no representations about the suitability of this software for any
 * purpose.  It is provided "as is" without express or implied warranty.
 *
 * This version is based on the Tatu Ylonen public domain implementation of
 * the regular expression matching subroutines.
 *
 **********************************************************************
 *
 * Author: Tatu Ylonen <ylo@ngs.fi>
 * 
 * Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
 *
 * Permission to use, copy, modify, distribute, and sell this software
 * and its documentation is hereby granted without fee, provided that the
 * above copyright notice appears in all source code copies, the name of
 * Tatu Ylonen is not used to advertise products containing this software
 * or a derivation thereof, and all modified versions are clearly marked
 * as such.
 * 
 * This software is provided "as is" without express or implied warranty.
 *
 * Created: Thu Sep 26 17:14:05 1991 ylo
 * Last modified: Sun Mar 29 16:47:31 1992 ylo
 * 
 * This code draws many ideas from the regular expression packages by
 * Henry Spencer of the University of Toronto and Richard Stallman of
 * the Free Software Foundation.
 *
 **********************************************************************
 */

#include <NLS/wchar.h>
#include <NLS/charset.h>
#include <NLS/wstring.h>
#include <NLS/wregexp.h>
#include <NLS/nlsfile.h>
#include <NLS/locale.h>
#include <OS/list.h>
#include <OS/memory.h>
#include <OS/string.h>
#include <OS/table.h>
#include <assert.h>
#include <memory.h>
#include <strings.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef IV_REGEXP_PATH
#define IV_REGEXP_PATH         "Regexp"
#endif

/*
 * class HitItem
 */

const int MaxChar = -1;	

class HitItem {
  public:
    HitItem ();
    HitItem (CharSet_T);
    HitItem (CharSet_T, int index);
    HitItem (CharSet_T, int start, int end);
    boolean between (CharSet_T, int) const;

  private:
    friend class HitMap;

    int			start_;
    int			end_;
    int			charset_;
};

/*
 * Default constructor.  Gives the item a deterministic state instead of
 * leaving the members uninitialised.
 * NOTE(review): -1 is used as the charset because CharSet::find() results
 * are compared against -1 elsewhere in this file, which suggests -1 means
 * "no such character set" -- confirm against NLS/charset.h.
 */
HitItem::HitItem () {
    charset_ = -1;
    start_ = 0;
    end_ = 0;
}

/* Build an item covering every character code of `charset` (0 .. unbounded). */
HitItem::HitItem (CharSet_T charset)
    : start_(0), end_(MaxChar), charset_(charset)
{
}

/* Build an item covering the single character code `index` of `charset`. */
HitItem::HitItem (CharSet_T charset, int index)
    : start_(index), end_(index), charset_(charset)
{
}

/* Build an item covering the codes start..end (inclusive) of `charset`. */
HitItem::HitItem (CharSet_T charset, int start, int end)
    : start_(start), end_(end), charset_(charset)
{
}

/*
 * Returns true when `charcode` of `charset` falls inside this item.
 * An item whose end_ is MaxChar has no upper bound.
 */
inline boolean HitItem::between(CharSet_T charset, int charcode) const {
    if (charset != charset_) {
        return false;
    }
    if (charcode < start_) {
        return false;
    }
    return end_ == MaxChar || charcode <= end_;
}

/*
 * HitItemList
 */

/* Generate the HitItemList class (a list of HitItem values) from the
   list macros pulled in through OS/list.h. */
declareList (HitItemList, HitItem);
implementList (HitItemList, HitItem);

/*
 * class HitMap
 */

/*
 * A set of characters expressed as a default answer plus a list of
 * HitItem intervals.  A character found in the list gets the opposite of
 * the default answer; any other character gets the default answer.
 */
class HitMap {
  public:
    HitMap (boolean = false);
    ~HitMap ();

    /* queries */
    boolean hit () const ;
    boolean hit (CharSet_T) const ;
    boolean hit (const WChar&) const ;
    boolean hit (const HitMap&) const ;

    /* modification */
    void clear ();
    void append ();
    void append (CharSet_T);
    void append (const WChar&);
    void append (const WChar&, const WChar&);
    void append (const HitMap&);

    HitMap& operator = (const HitMap&);

  private:
    boolean hit (CharSet_T, int) const ;
    boolean hit (CharSet_T, int, int) const ;

    boolean	default_;	/* answer for characters not in match_ */
    HitItemList *match_;	/* intervals that invert default_ */
};

/* Create a map whose default answer is `any` and whose interval list is empty. */
HitMap::HitMap (boolean any)
    : default_(any), match_(new HitItemList (4))
{
}

/* Release the interval list allocated by the constructor. */
HitMap::~HitMap ()
{
    delete match_;
}

/*
 * Copy the default answer and every interval of `he` into this map.
 * Returns *this.  The previous code returned `(HitMap) *this`, i.e. a
 * reference to a temporary shallow copy whose destructor would delete the
 * list this object still owns.  Self-assignment is a no-op; without the
 * guard the list would be emptied before being read.
 */
HitMap& HitMap::operator = (const HitMap& he) {
    if (this != &he) {
	default_ = he.default_;
	match_->remove_all();
	for (ListItr(HitItemList) i (*(he.match_)) ; i.more(); i.next()) {
	    match_->append(i.cur());
	}
    }
    return *this;
}

/* Reset to the empty set: no intervals and a default answer of false. */
void HitMap::clear ()
{
    match_->remove_all();
    default_ = false;
}

void HitMap::append () {
    boolean newline = hit(CharSet::ascii(), '\n');
    default_ = true;
    match_->remove_all();
    if (!newline) {
	HitItem item (CharSet::ascii(), '\n');
	match_->append(item);
    }
}

/* Add an interval covering the whole of `charset`. */
void HitMap::append (CharSet_T charset) {
    match_->append(HitItem (charset));
}

void HitMap::append (const WChar& ch) {
    HitItem item (ch.charset(), ch.charcode());
    match_->append(item);
}

/*
 * Add the interval start..end.  The request is silently ignored when the
 * two characters belong to different character sets or when start lies
 * after end.
 */
void HitMap::append (const WChar& start, const WChar& end) {
    if (start.charset() == end.charset() &&
	start.charcode() <= end.charcode()) {
	match_->append(
	    HitItem (start.charset(), start.charcode(), end.charcode())
	);
    }
}

void HitMap::append (const HitMap& map) {
    if (default_ == map.default_) {
        for (ListItr(HitItemList) i (*(map.match_)) ; i.more(); i.next()) {
	    match_->append(i.cur());
        }
    }
}

// hit

/*
 * Test a single character.  If any interval contains it the answer is the
 * opposite of default_; otherwise the answer is default_.
 */
boolean HitMap::hit (CharSet_T charset, int charcode) const {
    const int total = int(match_->count());
    const HitItem *items = match_->array(0, total);

    /* scan from the most recently appended interval backwards */
    for (int k = total - 1; k >= 0; k--) {
	if (items[k].between(charset, charcode)) {
	    return !default_;
	}
    }
    return default_;
}

/*
 * Test a character range begin..end of `charset`.  Returns true when
 * either end point is hit, or when an end point of some stored interval
 * lies inside the range and that point is hit (plain map) / not hit
 * (complemented map).
 */
boolean HitMap::hit (CharSet_T charset, int begin, int end) const {
    if (hit(charset, begin)) {
	return true;
    }
    if (hit(charset, end)) {
	return true;
    }

    HitItem span(charset, begin, end);
    const int total = int(match_->count());
    const HitItem *items = match_->array(0, total);

    for (int k = total - 1; k >= 0; k--) {
	const HitItem & cur = items[k];

	if (span.between(cur.charset_, cur.start_)) {
	    boolean inside = hit(cur.charset_, cur.start_);
	    if (default_ ? !inside : inside)
		return true;
	}
	if (span.between(cur.charset_, cur.end_)) {
	    boolean inside = hit(cur.charset_, cur.end_);
	    if (default_ ? !inside : inside)
		return true;
	}
    }
    return false;
}

/* Test the whole of `charset`, i.e. the open-ended range starting at code 0. */
inline boolean HitMap::hit (CharSet_T charset) const
{
    return hit(charset, 0, MaxChar);
}

/*
 * Returns true when the map can hit at least one character.
 * A complemented map (default_ true) is always reported as hitting.
 * For a plain map each distinct character set that appears in the list is
 * tested once with the whole-charset range query.
 *
 * The loop counter used to be decremented after the loop instead of
 * inside it, so the loop could never advance past the last item.
 */
boolean HitMap::hit () const {
    if (default_) return true;

    const int total = int(match_->count());
    const HitItem *item = match_->array(0, total);

    for (int count = total; count > 0; count--) {
	const HitItem & i = item[count - 1];

	/* skip this item if a later item has the same charset; that
	   charset has already been tested */
	int j = total;
	while (j > count) {
	    if (i.charset_ == item[j-1].charset_)
		break;
	    j--;
	}
	if (j == count && hit(i.charset_, 0, MaxChar))
	    return true;
    }
    return false;
}

/* Test one wide character by its character set and code. */
inline boolean HitMap::hit (const WChar& ch) const
{
    return hit (ch.charset(), ch.charcode());
}

/*
 * Returns true when this map and `he` may have a character in common.
 * If either map is complemented the answer is always true; otherwise each
 * interval of `he` is tested against this map with the range query.
 */
boolean HitMap::hit (const HitMap& he) const {
    if (default_) {
	return true;
    }
    if (he.default_) {
	return true;
    }

    const int total = int(he.match_->count());
    const HitItem *items = he.match_->array(0, total);

    for (int k = total - 1; k >= 0; k--) {
	const HitItem & cur = items[k];
	if (hit(cur.charset_, cur.start_, cur.end_)) {
	    return true;
	}
    }
    return false;
}

/*
 * HitMapTable
 */

/* Generate the HitMapTable class mapping a char key to a HitMap pointer.
   WRegexpRep::initialize() fills one instance with the categories read
   from the IV_REGEXP_PATH file. */
declareTable (HitMapTable, char, HitMap*);
implementTable (HitMapTable, char, HitMap*);

/*
 * Definitions for WRegexpRep.
 */

/*
 * Opcodes of the compiled pattern.  Each opcode occupies one WChar cell
 * of the pattern buffer; operands follow in the next cell(s).
 */
enum RE_compiled_ops
{
  Cend                 = 0,   /* end of pattern reached */
  Cbol                 = 1,   /* beginning of line */
  Ceol                 = 2,   /* end of line */
  Cset                 = 3,   /* character set; followed by the set index */
  Cexact               = 4,   /* followed by the character to match */
  Canychar             = 5,   /* matches any character except newline */
  Ccategory            = 6,   /* matches a category; followed by its key */
  Cstart_memory        = 7,   /* set register start (register number follows) */
  Cend_memory          = 8,   /* set register end (register number follows) */
  Cmatch_memory        = 9,   /* match a copy of register contents (number follows) */
  Cjump                = 10,  /* followed by two displacement cells */
  Cstar_jump           = 11,  /* rewritten to jump/update_failure_jump at run time */
  Cfailure_jump        = 12,  /* jump to addr on failure */
  Cupdate_failure_jump = 13,  /* update topmost failure point and jump */
  Cdummy_failure_jump  = 14   /* push a dummy failure point and jump */
};

/*
 * Syntax codes assigned to plain and quoted pattern characters.
 * Rnum_ops is the number of codes and sizes the precedence table.
 */
enum RE_syntax_ops
{
  Rend      = 0,   /* special code for end of regexp */
  Rnormal   = 1,   /* normal character */
  Ranychar  = 2,   /* any character except newline */
  Rcategory = 3,   /* match succeeding charset */
  Rquote    = 4,   /* the quote character */
  Rbol      = 5,   /* match beginning of line */
  Reol      = 6,   /* match end of line */
  Roptional = 7,   /* match preceding expression optionally */
  Rstar     = 8,   /* match preceding expr zero or more times */
  Rplus     = 9,   /* match preceding expr one or more times */
  Ror       = 10,  /* match either of alternatives */
  Ropenpar  = 11,  /* opening parenthesis */
  Rclosepar = 12,  /* closing parenthesis */
  Rmemory   = 13,  /* match memory register */
  Ropenset  = 14,  /* open set.  Internal syntax hard-coded below. */
  Rnum_ops  = 15
};

/* Status codes kept in WRegexpRep::errno_. */
enum RE_ERROR {
  RE_success          = 0,
  RE_bad_register     = 1,
  RE_bad_parenthesis  = 2,
  RE_ends_prematurely = 3,
  RE_out_of_memory    = 4,
  RE_too_complex      = 5,
  RE_overflow         = 6,
  RE_unkown_ops       = 7,
  RE_internal         = 8,
  RE_not_found        = 9
};
    
/* Sizing limits of the regexp compiler and matcher. */
const int RE_NREGS    = 10;   /* match registers, register 0 included */
const int RE_NSETS    = 32;   /* HitMap slots per compiled pattern */
const int NUM_LEVELS  = 6;    /* precedence levels tracked per nesting depth */
const int MAX_NESTING = 100;  /* maximum parenthesis nesting / pending jumps */

/*
 * class WRegexpRep
 */

/*
 * Implementation object behind WRegexp: holds one compiled pattern, the
 * character sets it refers to and the registers of the latest match.
 */
class WRegexpRep {
  public:
    WRegexpRep (const WString& regex);
    ~WRegexpRep ();

    /* matching */
    int match(const WString& string, int pos);
    int search(const WString& string, int pos, int range);

    /* results of the latest match / error state */
    int beginning_of_match (int i) const;
    int end_of_match (int i) const;
    const char* error_message() const;

  private:
    void compile(const WString &regex);

    int  do_compile_fastmap(WChar *buffer, int used, int pos,
			    char& can_be_null, HitMap* fastmap);
    void compile_fastmap_aux(WChar *code, int pos, char *visited,
			     char &can_be_null, HitMap* fastmap);

  private:
    WChar *buffer_;		/* compiled pattern */
    int	  length_;		/* length of compiled pattern */
    int	  allocated_;		/* allocated length of buffer */
    HitMap *hit_[RE_NSETS];	/* HitMap array; slot 0 is scratch for match() */
    int start_[RE_NREGS];	/* start offset of region */
    int end_[RE_NREGS]; 	/* end offset of region */
    int errno_;			/* error number (RE_ERROR) */
    char can_be_null_;		/* true if can match empty string */
    boolean uses_registers_;	/* registers are used */

  protected:
    /* fills the static tables below; run on first use */
    static void initialize();

  public:
    static const WString& word_pattern();

  private:
    static boolean initialized_;
    static unsigned char plain_[256];		/* syntax code of a plain char */
    static unsigned char quoted_[256];		/* syntax code of a quoted char */
    static unsigned char precedences_[Rnum_ops];
    static HitMapTable *table_;			/* categories by key char */
    static WString *word_;			/* pattern returned by word_pattern() */
};

/* Storage for the class-wide tables; all of it starts out zeroed and is
   filled in by WRegexpRep::initialize(). */
boolean WRegexpRep::initialized_ = false;
unsigned char WRegexpRep::plain_[256];
unsigned char WRegexpRep::quoted_[256];
unsigned char WRegexpRep::precedences_[Rnum_ops];
HitMapTable *WRegexpRep::table_ = nil;
WString *WRegexpRep::word_ = nil;

/*
 * Compile `regexp`.  The static syntax tables are built on first use.
 * Compilation errors are not reported here; they are recorded in errno_
 * by compile() and surface through error_message().
 */
WRegexpRep::WRegexpRep (const WString& regexp) {
    if (!initialized_) {
	initialize();
    }

    buffer_ = nil;
    allocated_ = 0;
    length_ = 0;
    can_be_null_ = 0;
    uses_registers_ = false;
    errno_ = RE_success;

    /* declared once for both loops: a variable declared in a for-init
       statement is not visible after that loop in ISO C++ */
    int i;
    for (i = 0; i < RE_NSETS; i++) {
	hit_[i] = nil;
    }
    for (i = 0; i < RE_NREGS; i++) {
	start_[i] = -1;
	end_[i] = -1;
    }
    compile (regexp);
}

/* Release the compiled pattern (allocated with malloc/realloc) and every
   HitMap created for it. */
WRegexpRep::~WRegexpRep () {
    if (buffer_) {
	free ((char *)buffer_);
    }
    int slot = 0;
    while (slot < RE_NSETS) {
	delete hit_[slot];
	slot++;
    }
}

/*
 * static functions
 */
void WRegexpRep::initialize() {
    initialized_ = true;

    /* initialize plain & quoted table */
    for (int i = 0; i < 256; i++) {
	plain_[i] = Rnormal;
	quoted_[i] = Rnormal;
    }
    
    plain_['\134'] = Rquote;
    plain_['\050'] = Ropenpar;
    plain_['\051'] = Rclosepar;
    plain_['\n'] = Ror;
    plain_['|'] = Ror;
    plain_['*'] = Rstar;
    plain_['+'] = Rplus;
    plain_['?'] = Roptional;
    plain_['\133'] = Ropenset;
    plain_['\136'] = Rbol;
    
    plain_['$'] = Reol;
    plain_['.'] = Ranychar;
    
    for (i = '0'; i <= '9'; i++) {
	quoted_[i] = Rmemory;
    }
    quoted_['c'] = Rcategory;

    /* initialize precedences table */

    for (i = 0; i < Rnum_ops; i++)
	precedences_[i] = 4;
    precedences_[Ror] = 3;
    precedences_[Rbol] = 2;
    precedences_[Reol] = 2;
    precedences_[Rclosepar] = 1;
    precedences_[Rend] = 0;

    /* initialize category/word table */

    word_ = nil;
    table_ = new HitMapTable (10);
   
    int		argc, ac;
    const int	argsize = 64;
    char	*argv[argsize], **av;
    nlsFile	file (IV_REGEXP_PATH);
    HitMap	*map = nil;
    int		word = nil;
    Locale	locale("C");
    
    while ((argc = file.getline (argv, argsize)) >= 0) {
	if (argc == 0) {
	    continue;
	} else if (!file.continued()) {
	    map = nil; word = nil;
	    if (argv[0][0] == 'C') {
	        map = new HitMap();
	        table_->insert(argv[1][0], map);
	    	ac = argc - 2, av = argv + 2;
	    } else if (argv[0][0] == 'W') {
	    	ac = argc - 1, av = argv + 1, word = 1;
	    }
	} else {
	    ac = argc, av = argv;
	}

	while (map && ac > 0) {
	    char *p = index(av[0], ':');
	    if (p != 0) { *p++ = '\0'; }
	    CharSet_T charset = CharSet::find (av[0]);
	    if (charset != -1) {
		if (p) {
		    char *q = index (p, '-');
		    if (q) {
			*q++ = '\0';
			WChar start (file.str2int(p) & 0x7f7f, charset);
			WChar end (file.str2int(q) & 0x7f7f, charset);
			map->append(start, end);
		    } else {
			WChar ch(file.str2int(p) & 0x7f7f, charset);
			map->append(ch);
		    }
		} else {
		    map->append(charset);
		}
	    }
	    ac--, av++;
	}
	if (word && ac > 0) {
	    word_ = new WString(locale, av[0]);
	}
    }
    if (!word_) {
	word_ = new WString(locale, "[a-zA-Z0-9]+");
    }
}

/* Returns the pattern describing a "word", building the static tables
   first if that has not happened yet. */
const WString& WRegexpRep::word_pattern () {
    if (initialized_) {
	return *word_;
    }
    initialize();
    return *word_;
}

/*
 * instance functions
 */
void WRegexpRep::compile(const WString &regex) {
    WChar ch, nch;
    WChar *pattern;
    int pos, op, current_level, level, opcode;
    int pattern_offset, alloc;
    int starts[NUM_LEVELS * MAX_NESTING], starts_base;
    int future_jumps[MAX_NESTING], num_jumps;
    int next_register, paren_depth, num_open_registers;
    int open_registers[RE_NREGS];
    int beginning_context;
    int size = regex.length();
    int nset = 1;

#define MACRO_BEGIN do {
			    
#define MACRO_END } while (0)

#define NEXTCHAR(var)					\
  MACRO_BEGIN						\
    if (pos >= size)					\
      goto ends_prematurely;				\
    (var) = regex[pos];					\
    pos++;						\
  MACRO_END

#define ALLOC(amount)					\
  MACRO_BEGIN						\
    if (pattern_offset+(amount) > alloc)		\
      {							\
	alloc += 256 + (amount);			\
	pattern = (WChar *) realloc((char *) pattern,	\
			  alloc * sizeof(WChar));	\
	if (!pattern)					\
	  goto out_of_memory;				\
      }							\
  MACRO_END

#define ALLOC_HIT(n,v)					\
  MACRO_BEGIN						\
    if (n < RE_NSETS) {					\
      hit_[n] = new HitMap(v);				\
    } else						\
      goto too_complex;					\
  MACRO_END

#define STORE(ch)  pattern[pattern_offset++] = (ch)

#define CURRENT_LEVEL_START				\
	(starts[starts_base + current_level])

#define SET_LEVEL_START					\
  starts[starts_base + current_level] = pattern_offset

#define PUSH_LEVEL_STARTS				\
  if (starts_base < (MAX_NESTING-1)*NUM_LEVELS)		\
      starts_base += NUM_LEVELS;			\
  else							\
      goto too_complex

#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS

#define PUT_ADDR(offset,addr)				\
  MACRO_BEGIN						\
    int disp = (addr) - (offset) - 2;			\
    pattern[(offset)] = disp;				\
    pattern[(offset)+1] = 0;				\
  MACRO_END

#define INSERT_JUMP(pos,type,addr)			\
  MACRO_BEGIN						\
    int a, p = (pos), t = (type), ad = (addr);		\
    for (a = pattern_offset - 1; a >= p; a--)		\
      pattern[a + 3] = pattern[a];			\
    pattern[p] = t;					\
    PUT_ADDR(p+1,ad);					\
    pattern_offset += 3;				\
  MACRO_END

#define SET_FIELDS					\
  MACRO_BEGIN						\
    buffer_ = pattern;					\
    allocated_ = alloc;					\
    length_ = pattern_offset;				\
  MACRO_END

#define      PLAIN_OPS(ch)				\
    ((ch.charset() == CharSet::ascii()) ? plain_[ch.charcode()] : Rnormal)
#define	     QUOTED_OPS(ch)				\
    ((ch.charset() == CharSet::ascii()) ? quoted_[ch.charcode()] : Rnormal)
	
    uses_registers_ = 0;
    pattern = buffer_;
    alloc = allocated_;

    if (alloc == 0 || pattern == NULL) {
	alloc = 256;
	pattern = (WChar *) malloc(alloc * sizeof(WChar));
	if (!pattern) goto out_of_memory;
    }
    
    pattern_offset = 0;
    starts_base = 0;
    num_jumps = 0;
    current_level = 0;
    
    SET_LEVEL_START;
    
    num_open_registers = 0;
    next_register = 1;
    paren_depth = 0;
    beginning_context = 1;
    op = -1;

    /* we use Rend dummy to ensure that pending jumps are updated
       (due to low priority of Rend) before exiting the loop. */

    pos = 0;
    while (op != Rend) {

	if (pos >= size) {
	    op = Rend;
	} else {
	    NEXTCHAR(ch);
	    op = PLAIN_OPS(ch);
	    if (op == Rquote) {
		NEXTCHAR(ch);
		op = QUOTED_OPS(ch);
	    }
	}
	
	level = precedences_[op];
	
	if (level > current_level) {
	    
	    for (current_level++; current_level < level; current_level++) {
		SET_LEVEL_START;
	    }
	    SET_LEVEL_START;
	    
	} else if (level < current_level) {
	    
	    current_level = level;
	    while (num_jumps > 0 && 
		 future_jumps[num_jumps-1] >= CURRENT_LEVEL_START) {
		PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
		num_jumps--;
	    }
	}
	
	switch (op) {
	  case Rend:
	    break;
	    
	  case Rnormal:
	  normal_char:
	    opcode = Cexact;
	    
	  store_opcode_and_arg: /* opcode & ch must be set */
	    
	    SET_LEVEL_START;
	    ALLOC(2);
	    STORE(opcode);
	    STORE(ch);
	    break;
	    
	  case Ranychar:
	    opcode = Canychar;
	    
	  store_opcode:
	    SET_LEVEL_START;
	    ALLOC(1);
	    STORE(opcode);
	    break;
	    
	  case Rquote:
	    goto too_complex;
	    /*NOTREACHED*/
	    
	  case Rbol:
	    if (!beginning_context)
		goto normal_char;
	    opcode = Cbol;
	    goto store_opcode;
	    
	case Reol:
	    if (!(pos >= size || regex[pos] == '\174' || regex[pos] == '\051'))
		goto normal_char;
	    opcode = Ceol;
	    goto store_opcode;
	    
	case Roptional:
	    if (beginning_context)
		goto normal_char;
	    if (CURRENT_LEVEL_START == pattern_offset)
		break; /* ignore empty patterns for ? */

	    ALLOC(3);
	    INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
			pattern_offset + 3);
	    break;
	    
	  case Rstar:
	  case Rplus:
	    if (beginning_context)
		goto normal_char;
	    
	    if (CURRENT_LEVEL_START == pattern_offset)
		break; /* ignore empty patterns for + and * */
	    
	    ALLOC(9);
	    INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
			pattern_offset + 6);
	    INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START);
	    if (op == Rplus)  /* jump over initial failure_jump */
		INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
			    CURRENT_LEVEL_START + 6);
	    break;
	    
	  case Ror:
	    ALLOC(6);
	    INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
			pattern_offset + 6);
	    if (num_jumps >= MAX_NESTING)
		goto too_complex;
	    STORE(Cjump);
	    future_jumps[num_jumps++] = pattern_offset;
	    STORE(0);
	    STORE(0);
	    SET_LEVEL_START;
	    break;
	    
	  case Ropenpar:
	    SET_LEVEL_START;
	    if (next_register < RE_NREGS) {
		uses_registers_ = 1;
		ALLOC(2);
		STORE(Cstart_memory);
		STORE(next_register);
		open_registers[num_open_registers++] = next_register;
		next_register++;
	    }
	    paren_depth++;
	    PUSH_LEVEL_STARTS;
	    current_level = 0;
	    SET_LEVEL_START;
	    break;
	    
	  case Rclosepar:
	    if (paren_depth <= 0)
		goto parenthesis_error;
	    POP_LEVEL_STARTS;
	    current_level = precedences_[Ropenpar];
	    paren_depth--;
	    if (paren_depth < num_open_registers) {
		uses_registers_ = 1;
		ALLOC(2);
		STORE(Cend_memory);
		num_open_registers--;
		STORE(open_registers[num_open_registers]);
	    }
	    break;
	    
	  case Rmemory:
	    if (ch == '0')
		goto bad_match_register;
	    assert(ch >= '0' && ch <= '9');
	    uses_registers_ = 1;
	    opcode = Cmatch_memory;
	    ch = ch.charcode() - '0';
	    goto store_opcode_and_arg;

	  case Rcategory:
	    SET_LEVEL_START;
	    ALLOC(2);
	    STORE(Ccategory);
	    NEXTCHAR(ch);
	    STORE(ch);
	    break;
		
	  case Ropenset:
	    {
		boolean complement = false;
		
		SET_LEVEL_START;
		ALLOC(2);
		STORE(Cset);
		STORE(nset);
		
		NEXTCHAR(ch);
		if (ch == '\136') {
		    complement = true;
		    NEXTCHAR(ch);
		}
		ALLOC_HIT(nset, complement);
		WChar prev = 0;
		boolean range = false;
		boolean firstchar = true;
		while (ch != '\135' || firstchar) {
		    firstchar = false;
		    if (range) {
			hit_[nset]->append(prev, ch);
			prev = 0;
			range = 0;
		    } else {
			if (prev != 0 && ch == '-') {
			    range = 1;
			} else {
			    hit_[nset]->append(ch);
			    prev = ch;
			}
		    }
		    NEXTCHAR(ch);
		}
		if (range) {
		    hit_[nset]->append('-');
		}
		nset++;
	    }
	    break;
	    
	  default:
	    goto too_complex;
	}
	beginning_context = (op == Ropenpar || op == Ror);
    }
    if (starts_base != 0)
	goto parenthesis_error;
    
    assert(num_jumps == 0);
    ALLOC(1);
    STORE(Cend);
    SET_FIELDS;
    errno_ = RE_success;
    return;
    
  bad_match_register:
    SET_FIELDS;
    errno_ = RE_bad_register;
    return;
    
  parenthesis_error:
    SET_FIELDS;
    errno_ = RE_bad_parenthesis;
    return;

  ends_prematurely:
    SET_FIELDS;
    errno_ = RE_ends_prematurely;
    return;
    
  out_of_memory:
    SET_FIELDS;
    errno_ = RE_out_of_memory;
    return;
    
  too_complex:
    SET_FIELDS;
    errno_ = RE_too_complex;
    return;
}

/* Drop the helper macros private to compile().  MACRO_BEGIN and MACRO_END
   are kept because match() below still uses them. */
#undef NEXTCHAR
#undef ALLOC
#undef ALLOC_HIT
#undef STORE
#undef CURRENT_LEVEL_START
#undef SET_LEVEL_START
#undef PUSH_LEVEL_STARTS
#undef POP_LEVEL_STARTS
#undef PUT_ADDR
#undef INSERT_JUMP
#undef SET_FIELDS
#undef PLAIN_OPS
#undef QUOTED_OPS

/*
 * Walk the compiled code from `pos` and add to `hitmap` every character
 * that can be consumed first on some path.  `visited` has one flag per
 * code position and stops the walk from following a position twice.
 * `can_be_null` is set to 1 when a path reaches the end of the pattern
 * (or a back-reference) without consuming anything, and to 2 when the
 * only such path is an end-of-line test.
 */
void WRegexpRep::compile_fastmap_aux(
    WChar *code, int pos, char* visited, char& can_be_null, HitMap* hitmap
) {
  if (visited[pos]) {
    return;  /* we have already been here */
  }
  visited[pos] = 1;

  HitMap* category;
  int disp;

  for (;;) {
    switch (code[pos++].value()) {

      case Cend:
	can_be_null = 1;
	return;

      case Cbol:
	/* consumes nothing; keep walking */
	break;

      case Ceol:
	hitmap->append(WChar('\n'));
	if (can_be_null == 0) {
	    can_be_null = 2;  /* can match null, but only at end of buffer*/
	}
	return;

      case Cset:
	hitmap->append(*(hit_[code[pos].value()]));
	return;

      case Ccategory:
	if (table_->find(category, code[pos].value())) {
	    hitmap->append(*category);
	}
	return;

      case Cexact:
	hitmap->append(code[pos]);
	return;

      case Canychar:
	hitmap->append();
	return;

      case Cstart_memory:
      case Cend_memory:
	/* skip the register number */
	pos++;
	break;

      case Cmatch_memory:
	hitmap->append();
	can_be_null = 1;
	return;

      case Cjump:
      case Cdummy_failure_jump:
      case Cupdate_failure_jump:
      case Cstar_jump:
	/* unconditional transfer: follow it */
	disp = code[pos++].value();
	disp |= code[pos++].value() << 16;
	pos += disp;

	if (visited[pos]) {
	    /*
	     * The regexp contains an empty loop.  This path leads
	     * nowhere new; the remaining paths are pursued by the
	     * callers.
	     */
	    return;
	}
	visited[pos] = 1;
	break;

      case Cfailure_jump:
	/* both the jump target and the fall-through are possible */
	disp = code[pos++].value();
	disp |= code[pos++].value() << 16;
	disp = pos + disp;
	compile_fastmap_aux(code, disp, visited, can_be_null, hitmap);
	break;

      default:
	/*NOTREACHED*/
	break;
    }
  }
}

/*
 * Compute into `fastmap` the characters that can start a match of the
 * code beginning at `pos`, and report through `can_be_null` whether that
 * code can match without consuming input.
 * Returns 1 on success, 0 when the visited table cannot be allocated (in
 * which case `can_be_null` and `fastmap` are left untouched).
 */
int WRegexpRep::do_compile_fastmap(
    WChar *buffer, int len, int pos, char& can_be_null, HitMap* fastmap
) {
  /* one zeroed flag per code position, plus one */
  char *seen = (char *)calloc(len+1, 1);
  if (seen == NULL) {
    return 0;
  }

  can_be_null = 0;
  fastmap->clear();
  compile_fastmap_aux(buffer, pos, seen, can_be_null, fastmap);

  free(seen);
  return 1;
}

#define INITIAL_FAILURES  256  /* initial # failure points to allocate */
#define MAX_FAILURES     4100  /* max # of failure points before failing */

int WRegexpRep::match (const WString& str, int pos) {

  struct failure_point {
     const WChar *text;
     WChar *code;
  } *failure_stack_start, *failure_sp, *failure_stack_end,
    initial_failure_stack[INITIAL_FAILURES];

  WChar *code;
  const WChar *text, *textend;
  const WChar *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
  const WChar *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
  WChar ch, regch, reg;
  const WChar *regtext, *regpartend, *regtextend;
  int a, b, op, match_end;
  HitMap* map;

  if (errno_ != RE_success && errno_ != RE_not_found && errno_ != RE_overflow)
      return -2;

  errno_ = RE_success;
  const WChar* string = str.string();  
  int size = str.length();
  
#define PREFETCH					\
  MACRO_BEGIN						\
    if (text == textend)				\
      goto fail;					\
  MACRO_END

#define NEXTCHAR(var)				\
  MACRO_BEGIN					\
    PREFETCH;					\
    (var) = *text++;				\
  MACRO_END

  assert(pos >= 0 && pos <= size && size >= 0);

  text = string + pos;
  textend = string + size;

  if (uses_registers_)
    for (a = 0; a < RE_NREGS; a++)
      regend_text[a] = NULL;

  code = buffer_;
  failure_stack_start = failure_sp = initial_failure_stack;
  failure_stack_end = initial_failure_stack + INITIAL_FAILURES;

 continue_matching:

  for (;;) {
      op = (code++)->value();
      switch (op) {
	  
	case Cend:
	  match_end = text - string;
	  start_[0] = pos;
	  end_[0] = match_end;
	  if (uses_registers_) {
	      for (int i = 1; i < RE_NREGS; i++) {
		  start_[i] = -1;
		  end_[i] = -1;
	      }
	  } else {
	      for (int i = 1; i < RE_NREGS; i++) {
		  if (regend_text[i] == NULL) {
		      start_[i] = -1;
		      end_[i] = -1;
		      continue;
		  }
		  start_[i] = ((char*) regstart_text[i] - (char*) string)
				/ sizeof (WChar);
		  end_[i] = ((char*) regend_text[i] - (char*) string)
				/ sizeof (WChar);
	      }
	  }

	  if (failure_stack_start != initial_failure_stack)
	      free((char *)failure_stack_start);
	  return match_end - pos;

	case Cbol:
	  if (text == string || text[-1] == '\n') /* text[-1] always valid */
	      break;
	  goto fail;
	  
	case Ceol:
	  if (text == string + size || *text == '\n')
	      break;
	  goto fail;

	case Cset:
	  NEXTCHAR(ch);
	  if (hit_[code->value()]->hit(ch)) {
	      code++;
	      break;
	  }
	  goto fail;

	case Cexact:
	  NEXTCHAR(ch);
	  if (ch != *code++)
	      goto fail;
	  break;

	case Canychar:
	  NEXTCHAR(ch);
	  if (ch == '\n')
	      goto fail;
	  break;

	case Ccategory:
	  NEXTCHAR(ch);
	  if (table_->find(map, code->value())) {
	      if (map->hit(ch)) {
		  code++;
		  break;
	      }
	  }
	  goto fail;
	  
	case Cstart_memory:
	  reg = *code++;
	  regstart_text[reg.value()] = text;
	  break;

	case Cend_memory:
	  reg = *code++;
	  regend_text[reg.value()] = text;
	  break;

	case Cmatch_memory:
	  reg = *code++;
	  if (regend_text[reg.value()] == NULL)
	      goto fail;  /* or should we just match nothing? */
	  regtext = regstart_text[reg.value()];
	  regtextend = regend_text[reg.value()];
	  if (regstart_partend[reg.value()] == regend_partend[reg.value()])
	      regpartend = regtextend;
	  else
	      regpartend = string + size;
	  
	  for (;regtext != regtextend;) {
	      NEXTCHAR(ch);
	      regch = *regtext++;
	      if (regch != ch)
		  goto fail;
	  }
	  break;
	  
	case Cstar_jump:
	  /* star is coded as:
	       1: failure_jump 2
	          ... code for operand of star
		  star_jump 1
	       2: ... code after star
	     We change the star_jump to update_failure_jump if we can determine
	     that it is safe to do so; otherwise we change it to an ordinary
	     jump.
	     plus is coded as
	          jump 2
	       1: failure_jump 3
	       2: ... code for operand of plus
	          star_jump 1
	       3: ... code after plus
	     For star_jump considerations this is processed identically
	     to star. */
	  a = (*code++).value();
	  a |= (*code++).value() << 16;
	  {
	      const WChar *p1, *p2;
	      char can_be_null;

	      p1 = code + a + 3; /* skip the failure_jump */
	      assert(p1[-3].value() == Cfailure_jump);
	      p2 = code;

	      if (hit_[0] == nil)
		  hit_[0] = new HitMap();
	      hit_[0]->clear();
	      
	      /* p1 points inside loop, p2 points to after loop */

	      if (!do_compile_fastmap(buffer_, length_, p2 - buffer_,
				      can_be_null, hit_[0]))
		  goto make_normal_jump;
	      
	      /*
	       * If we might introduce a new update point inside the loop,
	       * we can't optimize because then update_jump would update a
	       * wrong failure point.  Thus we have to be quite careful here.
	       */

	    loop_p1:
	      
	      /* loop until we find something that consumes a character */
	      
	      switch ((*p1++).value()) {
		case Cbol:
		case Ceol:
		  goto loop_p1;
		case Cstart_memory:
		case Cend_memory:
		  p1++;
		  goto loop_p1;
	        case Cexact:
		  ch = *p1++;
		  if (hit_[0]->hit(ch))
		      goto make_normal_jump;
		  break;
		case Canychar:
		  if (hit_[0]->hit())
	  	      goto make_normal_jump;
		  break;
		case Ccategory:
		  ch = *p1++;
		  if (table_->find(map, ch.value())) {
		      if (hit_[0]->hit(*map)) {
			  goto make_normal_jump;
		      }
		  }
		  break;
		case Cset:
		  ch = *p1++;
		  if (hit_[0]->hit(*hit_[ch.value()]))
		      goto make_normal_jump;
		  break;
		default:
		  goto make_normal_jump;
	      }
	      /* now we know that we can't backtrack. */
	      while (p1 != p2 - 3) {
		  switch ((*p1++).value()) {
		    case Cend:
		      /*NOTREACHED*/
		      goto error;
		    case Cbol:
		    case Ceol:
		    case Canychar:
		      break;
		    case Ccategory:
		    case Cset:
		      p1 += 1;
		      break;
		    case Cexact:
		    case Cstart_memory:
		    case Cend_memory:
		    case Cmatch_memory:
		      p1++;
		      break;
		    case Cjump:
		    case Cstar_jump:
		    case Cfailure_jump:
		    case Cupdate_failure_jump:
		    case Cdummy_failure_jump:
		      goto make_normal_jump;
		    default:
		      /* unknown operator */
		      errno_ = RE_unkown_ops;
		      return -2;
		  }
	      }
	      goto make_update_jump;
	  }
	make_normal_jump:
	  /* printf("changing to normal jump\n"); */
	  code -= 3;
	  *code = Cjump;
	  break;
	make_update_jump:
	  /* printf("changing to update jump\n"); */
	  code -= 2;
	  a += 3;  /* jump to after the Cfailure_jump */
	  code[-1] = Cupdate_failure_jump;
	  code[0] = a;
	  code[1] = 0;
	  /* fall to next case */
	case Cupdate_failure_jump:
	  failure_sp[-1].text = text;
	  /* fall to next case */
	case Cjump:
	  a = (*code++).value();
	  a |= (*code++).value() << 16;
	  code += a;
	  break;
	  
	case Cdummy_failure_jump:
	case Cfailure_jump:
	  if (failure_sp == failure_stack_end) {
	      if (failure_stack_start != initial_failure_stack)
		  goto error;
	      failure_stack_start = (struct failure_point *)
		  malloc(MAX_FAILURES * sizeof(*failure_stack_start));
	      failure_stack_end = failure_stack_start + MAX_FAILURES;
	      memcpy((char *)failure_stack_start,
		     (char *)initial_failure_stack,
		     INITIAL_FAILURES * sizeof(*failure_stack_start));
	      failure_sp = failure_stack_start + INITIAL_FAILURES;
	  }
	  a = (*code++).value();
	  a |= (*code++).value() << 16;
	  if (code[-3] == Cdummy_failure_jump) {
	      
	      /* this is only used in plus */
	      assert(*code == Cfailure_jump);
	      b = code[1].value();
	      b |= code[2].value() << 16;
	      failure_sp->code = code + b + 3;
	      failure_sp->text = NULL;
	      code += a;
	      
	  } else {
	      failure_sp->code = code + a;
	      failure_sp->text = text;
	  }
	  failure_sp++;
	  break;
	  
	default:
	  /*NOTREACHED*/
	  goto error;
      }
  }
  /*NOTREACHED*/
  
 fail:
  if (failure_sp != failure_stack_start) {
      failure_sp--;
      text = failure_sp->text;
      if (text == NULL)
	  goto fail;
      code = failure_sp->code;
      goto continue_matching;
  }
  
  if (failure_stack_start != initial_failure_stack)
      free((char *)failure_stack_start);
  errno_ = RE_not_found;
  return -1;
  
 error:
  if (failure_stack_start != initial_failure_stack)
      free((char *)failure_stack_start);
  errno_ = RE_overflow;
  return -2;
}

/*
 * Look for the first position where the pattern matches, starting at
 * `pos` and trying `range` consecutive positions: forwards when range is
 * positive, backwards when it is negative.
 *
 * Returns the position of the match, -1 when none of the positions
 * matches (errno_ = RE_not_found), or -2 on an internal problem.  In the
 * -2 case errno_ keeps the value set by match(), so error_message()
 * names the actual cause and a stack overflow on one subject string does
 * not disable the object for later calls.
 *
 * A forward search skips positions whose character cannot start a match
 * (the fastmap).  That filter is used only when the fastmap was built
 * successfully and the pattern cannot match the empty string; otherwise
 * positions where an empty match is possible would be skipped.
 */
int WRegexpRep::search(
    const WString& string, int pos, int range
) {
  if (errno_ != RE_success && errno_ != RE_not_found && errno_ != RE_overflow)
      return -2;

  int dir, ret;
  char anchor;
  HitMap map;
  int use_fastmap = 0;

  int size = string.length();
  assert (size >= 0 && pos >= 0);
  assert (pos + range >= 0 && pos + range <= size);

  if (buffer_[0] == Cbol)
    anchor = 1;		 /* bol */
  else
    anchor = 0;		 /* none */

  if (range < 0) {
      dir = -1;
      range = -range;
  } else {
      char can_be_null = 0;
      dir = 1;
      if (do_compile_fastmap(buffer_, length_, 0, can_be_null, &map)
	  && can_be_null != 1) {
	  use_fastmap = 1;
      }
  }

  for (; range > 0; range--, pos += dir) {
      if (anchor == 1) { /* anchored to begline */
	  if (pos > 0 && string[pos - 1] != '\n')
	    continue;
      }
      if (use_fastmap && !map.hit(string[pos])) {
	  continue;
      }
      assert(pos >= 0 && pos <= size);
      ret = match(string, pos);
      if (ret >= 0)
	return pos;
      if (ret == -2) {
	/* errno_ was set by match() */
	return -2;
      }
  }
  errno_ = RE_not_found;
  return -1;
}

/* Drop the helper macros of match(), together with the statement-wrapper
   macros and failure-stack sizes that nothing after this point uses. */
#undef PREFETCH
#undef NEXTCHAR
#undef PUSH_FAILURE
#undef MACRO_BEGIN
#undef MACRO_END
#undef INITIAL_FAILURES
#undef MAX_FAILURES

int WRegexpRep::beginning_of_match (int i) const {
    if (errno_ != RE_success)
	return -1;
    if (i < 0 || i >= RE_NREGS)
	return -1;
    return start_[i];
}

int WRegexpRep::end_of_match (int i) const {
    if (errno_ != RE_success)
	return -1;
    if (i < 0 || i >= RE_NREGS)
	return -1;
    return end_[i];
}

/*
 * Text for the current value of errno_.  Returns nil for RE_success and
 * "Unknown error" for any code without a message of its own.
 */
const char * WRegexpRep::error_message () const {
    const char *text = "Unknown error";

    switch (errno_) {
      case RE_success:
	text = nil;
	break;
      case RE_bad_register:
	text = "Bad match register number";
	break;
      case RE_bad_parenthesis:
	text = "Badly placed parenthesis";
	break;
      case RE_ends_prematurely:
	text = "Regular expression ends prematurely";
	break;
      case RE_too_complex:
	text = "Regular expression too complex";
	break;
      case RE_out_of_memory:
	text = "Out of memory";
	break;
      case RE_overflow:
	text = "Failure stack overflow";
	break;
      case RE_unkown_ops:
	text = "Unknown operator";
	break;
      case RE_not_found:
	text = "Pattern not found";
	break;
      default:
	/* keep the generic text */
	break;
    }
    return text;
}

/*
 * class WRegexp
 */

/* Compile the pattern given as a C string.
   NOTE(review): whether WString(const char*) copies the characters is not
   visible here; the other constructor uses CopyWString -- confirm that
   `regex` need not outlive this object. */
WRegexp::WRegexp(const char* regex)
{
    pattern_ = new WString(regex);
    rep_ = new WRegexpRep(*pattern_);
}

/* Compile the pattern given as a wide string; a CopyWString of it is kept. */
WRegexp::WRegexp(const WString& regex)
{
    pattern_ = new CopyWString(regex);
    rep_ = new WRegexpRep(*pattern_);
}

/* Release the stored pattern string and the compiled representation. */
WRegexp::~WRegexp()
{
    delete pattern_;
    delete rep_;
}

/* Match at exactly `index`; forwards to WRegexpRep::match(). */
int WRegexp::Match(const WString& text, int index)
{
    return rep_->match(text, index);
}

/* Search `range` positions starting at `index`; forwards to
   WRegexpRep::search(). */
int WRegexp::Search(const WString& text, int index, int range)
{
    return rep_->search(text, index, range);
}

/* Start offset of sub-expression `subexp` in the latest match, or -1. */
int WRegexp::BeginningOfMatch(int subexp) const
{
    return rep_->beginning_of_match(subexp);
}

/* End offset of sub-expression `subexp` in the latest match, or -1. */
int WRegexp::EndOfMatch(int subexp) const
{
    return rep_->end_of_match(subexp);
}

/* Message for the latest compile/match status; nil when it succeeded. */
const char* WRegexp::ErrorString() const
{
    return rep_->error_message();
}

/* return Regular Expression for Word */

/* Allocates a new WRegexp compiled from the word pattern; the object is
   created with `new` and handed to the caller. */
WRegexp* WRegexp::Word()
{
    const WString& word = WRegexpRep::word_pattern();
    return new WRegexp(word);
}
