// TAKEON: Version 2.  This is a rewrite of the original takeon.

// If B<name> clashes with P<name> no warning is given other than <name> Unused.

// Convert a grammar into tables.

// PENDING: 1) improve some errors, eg when '=' is missing after P<...>
//          2) Currently using embedded comments containing the grammar
//             in the main code file.  An alternative design is to embed
//             code in the grammar file, yacc style (with { ... } sections
//             immediately before an option separator ('|' or ',') or
//             the rule terminator (';').  Similar to my earlier 'tacc'
//             parser (e.g. as in https://gtoal.com/languages/algol60/algolps9.e.html )
//             - the latter does simplify accessing parameters and passing
//             results back, though at the same time it limits flexibility.
//          3) When creating the imp77 grammar I came across a problem
//             that was related to the presence or absence of BIPs, which
//             interfered with the constants for the first P<> phrase.
//             I need to recreate that broken grammar file and fix the bug.
//          4) Currently no checking that there is a B<...> and a P<...>
//             of the same name.
//          5) Aadd a nullable() test of grammar rules for regexps -
//             try it against a null string and see if it matches!
//             (to expand the build-time loop detection)

/*

An erroneous ';' is not detected.  The following option should have a P<> but doesn't which *ought* to be detected but isn't.

P<Scalar_Declaration> =
                      <External_Scalar_Declaration>,
                      <OWN_or_CONST_Scalar_Declaration> ;
                      <AUTO_Scalar_Declaration> ;


 */

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <ctype.h>
#include <assert.h>

#include <wchar.h>
#include <locale.h>

#ifndef FALSE
#define FALSE (0!=0)
#endif
#ifndef TRUE
#define TRUE (0==0)
#endif

#define CHECK_FOR_LOOPS

#define MAX_INCLUDES 4

// really ought to put these in a struct
FILE *input_file[MAX_INCLUDES];
char *input_name[MAX_INCLUDES];
int   input_offset[MAX_INCLUDES];
int   input_line[MAX_INCLUDES];
int   input_column[MAX_INCLUDES];
int   input_last_line_last_column[MAX_INCLUDES];
int   input_level = -1;

static int debug_takeon = 0;
//#define S(pool) ((wchar_t *)&StringPool(pool))
#include "takeon-output.c"

static void takeon(FILE *f) {

  // State variables:
  static int def_phrase = FALSE;    // Are we defining a P<...> phrase at the moment?
                                    // I.e. have we already seen the 'P' part of
                                    // "P<...> = " but not the '='?
  static int def_alts = FALSE;      // Are we defining a alt at the moment?
                                    // I.e. have we already seen the '=' part of
                                    // "P<...> = "?
  static int def_type = 0;          // The type of the current phrase definition,
                                    // i.e. P<>, C<>, O<>, or B<>.
  
  static int sym = unassigned;

  // Describing the structure of a phrase:
  static int specialbit = 0; // '?' or '!' in a term, eg <?guard> or <!guard>.
                             // Only allowed in phrase names, not literals.
                             // though really there's no good reason that we
                             // couldn't support those too, with just a bit
                             // of extra work and some care.
  static int alt_count = 1;
  static int alt_count_index = 0;

  static int phrase_count = 0;
  static int phrase_count_index = 0;

  _initcode(initcode_nextfree = 0) = '\0';
  
  for (;;) {
    sym = nonspace();

    switch (sym) {  // almost all reads are done here (excepting calls to 'upto')
                    // The structure of this code approximates a state machine.

    case 'I': // Include file.
    case 'O': // OPTION DEFINITION (output on the fly, no need to store in a data structure)
    case 'B': // BIP DEFINITION    (requires matching code manually added to the parser)
    case 'P': // PHRASE DEFINITION (regular phrase consisting of terms and recursive phrase calls)
    case 'C': // CODE DEFINITION   (in-line code for parse-time actions. Use very carefully)

      if (def_alts || def_phrase) {
        fprintf(stderr,
                "ERROR: Phrase <%ls> was not finalised before starting a new one."
                "  Missing semi-colon!\n", String(current_def_phrasename));
        exit(EXIT_FAILURE);
      }
      
      def_phrase = TRUE; def_alts = FALSE; def_type = sym;

      // We'll wait until we get the name before we do anything else. ('<' is next)

      break;

      
    case '<': // This occurs either at the point of a phrase definition
              // (P<...> =) *or* as a term within an alt (= <term1> <term2>;).
      if (def_phrase) {
        // We're looking at the definition of a phrase:
        phrase_count = 0;
        current_def_phrasename = upto(L'>');
        switch (def_type) {
        case 'O': // 'O<>' will be added later.
                  break;
        case 'B': _bip_phrasename(current_def_bipno = NEXT_FREE_BIPNO++) = current_def_phrasename;
                  if (NEXT_FREE_BIPNO >= NUM_BIPS) NUM_BIPS = NEXT_FREE_BIPNO;
                  break;
        case 'P': _phrasename(current_def_simple_phraseno = NEXT_FREE_SIMPLE_PHRASENO++) = current_def_phrasename;
                  _ast_code(current_def_simple_phraseno) = -1;
                  if (NEXT_FREE_SIMPLE_PHRASENO >= NUM_SIMPLE_PHRASES) NUM_SIMPLE_PHRASES = NEXT_FREE_SIMPLE_PHRASENO;
                  break;
        case 'C': _semantic_phrasename(current_def_semantic_phraseno = NEXT_FREE_SEMANTIC_PHRASENO++) = current_def_phrasename;
                  if (NEXT_FREE_SEMANTIC_PHRASENO >= NUM_SEMANTIC_PHRASES) NUM_SEMANTIC_PHRASES = NEXT_FREE_SEMANTIC_PHRASENO;
                  break;
        case 'I': if (nonspace() != ';') fatal(';');

                 // push context
           
                  input_offset[input_level] = file_offset;
                  input_line[input_level] = source_line;
                  input_column[input_level] = source_column;
                  input_last_line_last_column[input_level] = last_line_last_column;

                  input_level += 1;
                  if (input_level >= MAX_INCLUDES) {
                    fprintf(stderr, "Error: I<include> files are too deeply nested:\n");
                    while (--input_level >= 0) {
                      fprintf(stderr, "       %s\n", input_name[input_level]);
                    }
                    exit(1);
                  }
                  char string[1024];
                  int length = wcstombs(string, String(current_def_phrasename), 1023); // NARROW CONVERSION
                  (void)length;
                  input_name[input_level] = strdup(string);
                  input_file[input_level] = fopen(input_name[input_level], "r");
                  if (input_file[input_level] == NULL) {
                    fprintf(stderr, "Error: could not open include file: %s - %s\n", input_name[input_level], strerror(errno));
                    exit(1);
                  }
                  //fprintf(stderr, "\n ----------------------------------- INCLUDE %s ------------------------------------\n", input_name[input_level]);
                  // reset key state variables.
                  def_phrase = FALSE;
                  def_alts = FALSE;
                  def_type = 0;
                  break;
        }
        break;
        // Next state is '='.  Maybe create a state variable to explicitly encode that,
        // to make detecting errors cleaner?
      } else {
        // We're looking at an instance of a phrase within an alt:
        int i;
        
        phrase_count += 1; // number of phrases seen in this alt.
        _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES;
        if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count;

        sym = next_wide_char();  // <?...> or <!...> or just <...> ?
        if (sym == '?') {
          specialbit = GUARD_PHRASE;
          // parse and assign but do not step the pointer past this location (NOTE/POP-POINTER)
	} else if (sym == '!') { // or use '~' or '\' ???
          // parse & invert.  return NULL on a failed parse, don't step
          // the pointer, and continue parsing this alt.
          specialbit = NEGATED_PHRASE;
	} else {
	  unget_wide_char(sym, input_file[input_level]);
          specialbit = 0;
	}

        current_use_phrasename = upto(L'>'); //fprintf(stderr, "{name=%ls}\n",name);
        // The phrase name may be a forward reference, at least during the first pass...
        if (pass == 0) {
          _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = 0; // This hole will be filled in on the next pass.
          break;
        }

        // Now on a pass where tables are complete...
        for (i = 0; i < NUM_BIPS; i++) {
          if (wcscmp(String(bip_phrasename(i)), String(current_use_phrasename)) == 0) {
            _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = i
                                              | specialbit /*GUARD_PHRASE or NEGATED_PHRASE*/
                                              | BIP_TYPE;
            break;
          }
        }
        // MINOR BUG HERE. When adding a P<...> phrasename to the P<> table, we should also
        // check that a B<...> or C<...> phrase does not already exist with the same name.
        // And similarly when adding B<> and C<> phrase names.
        if (i < NUM_BIPS) break;
        
        for (i = 0; i < NUM_SIMPLE_PHRASES; i++) {
          if (wcscmp(String(phrasename(i)), String(current_use_phrasename)) == 0) {
            _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = i
                                              | specialbit /*GUARD_PHRASE or NEGATED_PHRASE*/
                                              | PHRASE_TYPE;
            break;
          }
        }
        if (i < NUM_SIMPLE_PHRASES) break;
        
        for (i = 0; i < NUM_SEMANTIC_PHRASES; i++) {
          if (wcscmp(String(semantic_phrasename(i)), String(current_use_phrasename)) == 0) {
            _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = i
                                              | specialbit /*GUARD_PHRASE or NEGATED_PHRASE*/
                                              | SEMANTIC_TYPE;
            break;
          }
        }
        if (i < NUM_SEMANTIC_PHRASES) break;

        fprintf(stderr, "* Error: P<%ls> not defined.\n", String(current_use_phrasename));
        exit(EXIT_FAILURE);
        
      }
      // Next state is '='
      break;

    case '=': // start of a phrase alternative or a BIP defn
      // current_def_phrasename should be valid at this point
      // 'B' and 'P' phrases get added to table.
      // 'C' and 'O' phrases go in their own arrays.
      
      // TO DO: check for duplicate name definition!  (regardless of type,
      // i.e. P<fred> must clash with C<fred> etc))
      // At the moment, a duplicated name is only caught when trying to compile
      // any C file which includes the .h file.

      if (!def_phrase) {
        fatal(0); // We have an '=' that doesn't follow any sort of phrase definition start.
      }
      switch (def_type) {

      case 'O': // OPTION DEFINITION (output on the fly, no need to store in a data structure)
        {
          int number;
          sym = nonspace();
          if (!isdigit(sym)) fatal('0');
          number = sym-'0';
          sym = get_wide_char(input_file[input_level]);
          if (isdigit(sym)) {
            number = number*10 + (sym-'0');
          } else {
            unget_wide_char(sym, input_file[input_level]);
          }
          if (pass == 1) { // Would be much better *inside* the #ifdef GRAMMAR_G test.
            // This needs to migrate to an internal array.
            //fprintf(stdout, "#define OPTION_%ls %d\n",
            //                 CString(current_def_phrasename),
            //                 number); // check that - -> _ conversion has been done
          }
          break;
        }
        
      case 'B': // BIP DEFINITION    (requires matching code manually added to the parser)
        {
          // The entire O<> or B<> definition is handled here immediately.

          // expect a number and a ';'.
          // PLANNED UPGRADE: don't allow '0' as an explicit number,
          // and always preload B<WEOF>=0; because *all* parsers require a B_WEOF BIP.
          // Ditto for B_char, however NL and various strings etc are entirely optional
          // and could be built with P<> rules rather than BIPs.
          int c, number = nonspace();
          if (!isdigit(number)) fatal('0');
          number -= '0';
          c = get_wide_char(input_file[input_level]);
          if (isdigit(c)) {
            number = number*10 + (c-'0');
          } else {
            unget_wide_char(c, input_file[input_level]);
          }
  
          _bip_phrasename(current_def_bipno) = current_def_phrasename;
          _bip_map(current_def_bipno) = number;
          if (nonspace() != ';') fatal(';'); //
          def_phrase = FALSE;
  
          // Current comment is saved here for BIPS and Options.
          //_xcomment(this_phrase) = current_comment;
          //fprintf(stderr, "A: Setting xcomment(%d) = \"%ls\"\n", this_phrase, String(current_comment));
          //current_comment = Str_nextfree++; _StringPool(current_comment) = '\0';
          break;    
        }

      case 'C': // CODE DEFINITION   (in-line code for parse-time actions. Use very carefully)
        {

          // The entire C<> definition is handled here immediately.
          if (nonspace() != '{') {
            fatal('{'); // bracketed C code missing.
          }

          // _semantic_phrasename(current_def_semantic_phraseno) already filled in
          _semantic_code(current_def_semantic_phraseno) = c_code_block();

          if (nonspace() != ';') fatal(';'); //
          def_phrase = FALSE;

          // No comments saved for C<> definition blocks.
          //current_comment = Str_nextfree++; _StringPool(current_comment) = '\0';
          break;  
        }

      case 'P': // PHRASE DEFINITION (regular phrase consisting of terms and recursive phrase calls)
        { //  P<...> = ... ;

          _sequential_phrase_no_to_grammar_index(current_def_simple_phraseno) = NEXT_FREE_GRAMMAR_SLOTNO;  
          // _phrasename(current_def_simple_phraseno) = current_def_phrasename; already assigned
          _grammar_index_to_sequential_phrase_number(NEXT_FREE_GRAMMAR_SLOTNO) = current_def_simple_phraseno;

          def_phrase = FALSE; def_alts = TRUE;

          alt_count = 1;
          alt_count_index = NEXT_FREE_GRAMMAR_SLOTNO++;    // Hole for number of alternatives
          _gram(alt_count_index) = alt_count | COUNT_OF_ALTS;

          //phrase_count = 0;
          phrase_count_index = NEXT_FREE_GRAMMAR_SLOTNO++; // hole for number of items in first alt.
          _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES;          
          //if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count;
          
          // Next state is a term of some description.
          break;  
        }
        
      } // end of the inner switch
      break; // from the outer switch, back to the main loop.

    case '\'': // string literal currently implemented as a sequence of single chars
      if (def_phrase) {
        fprintf(stderr, "Line %d: P'...' is not allowed.  'Literals' are"
                " only allowed on the RHS of the '='.\n", source_line);
        exit(EXIT_FAILURE);
      } else {
        STRINGINDEX string = upto(L'\'');

        // TO DO: a string literal or a keyword literal can contain
        // C-style '\' escapes such as \n or \u2345

        // Should we allow whitespace only before the string or also between each character?
        // - depends on whether it is a keyword and what kind of stropping.
      
        {
          wchar_t *s = String(string);
          while (*s != '\0') {
            if ((((*s)&~255) != 0) && (((*s)&~255) != 0xFFFFFF00)) {
              // TO DO: UPGRADE THIS CONSTRUCT TO USE STRING_TYPE RATHER THAN CHAR_TYPE.  THEN DELETE CHAR_TYPE.
              // phrase_count += wcslen(String(string)); // Each char counts as a phrase, albeit a short one... Moved to loop.
              fprintf(stderr, "Warning: Unicode char constant truncated: %lc\n", (wint_t)(*s));
            }
            _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = CHAR_TYPE | WHITESPACE_ALLOWED | ((*s++)&255); // 8-bit only - hence why deprecating
            phrase_count += 1;
          }
        }
        _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES;
        if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count;
        break;
      }
      
    case '"': // keyword literal
      if (def_phrase) {
        fprintf(stderr, "Line %d: P\"...\" is not allowed.  \"Keywords\" are only allowed on the RHS of the '='.\n", source_line);
        exit(EXIT_FAILURE);
      } else {
        STRINGINDEX keyword = upto(L'"');

        phrase_count += 1;
        _gram(phrase_count_index) = phrase_count;
        if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count;
        
        // TO DO: a string literal or a keyword literal can contain C-style '\' escapes such as \n or \u2345
        _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = WHITESPACE_ALLOWED | keyword_code(keyword) /* TYPE already inserted by *_code() */;

        break;
      }
      
    case L'«': // regexp literal
      if (def_phrase) {
        fprintf(stderr, "Line %d: P«...» is not allowed.  Regular «expressions» are"
                        " only allowed on the RHS of the '='\n", source_line);
        exit(EXIT_FAILURE);
      } else {
        // regexp literal
        STRINGINDEX regex, anchored_regexp = Str_nextfree++;
        _StringPool(anchored_regexp) = '^';
        regex = upto(L'»'); // a little hacky. Relies on storing at Str_nextfree.
        assert(regex == anchored_regexp+1);
        
        phrase_count += 1;
        _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES;
        if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count;
        
        _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = regexp_code(anchored_regexp) | WHITESPACE_ALLOWED;
      }
      break;
      
    case '|': // next alternative
    case ',': // next alternative
      // previous phrase:
      _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES; // should have already been done
      if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count;

      // Next alt:
      alt_count += 1;
      _gram(alt_count_index) = alt_count | COUNT_OF_ALTS;

      phrase_count = 0;
      phrase_count_index = NEXT_FREE_GRAMMAR_SLOTNO++;
      _gram(phrase_count_index) = phrase_count;

      break;

    case '{': // Global Declarations, copied over to the .h file.
      {

        if (!def_alts) {

          // There is only one anonymous initcode block.  Multiple {} sections are
          // simply appended to it.

          // TO DO: I believe there is currently a problem if there is an unbalanced single or
          // double quote in a comment within a C code block in a .g file.  Not sure if it is
          // this block or the one in takeon-output.c.  It may be that the problem is in the /* ... */ style of comment.
          
          int level = 0;
          // WE NEED A PROPER C LEXER HERE OTHERWISE MESSED UP BY '{'s IN COMMENTS!
          for (;;) {
            sym = next_wide_char();
//#ifdef NEVER  // not yet used.  C<...> = { ... }; blocks are implemented in takeon-output.c and
            if (sym == '/') {                   //  should be very similar to the code here.
              _initcode(initcode_nextfree++) = sym;
              sym = next_wide_char();
              if (sym == '/') {
                // comment
                for (;;) {
                  _initcode(initcode_nextfree++) = sym;
                  sym = next_wide_char();
                  if (sym == '\n') break;
                }
              }
              // Also needs to handle ' and " strings outside comments.
              // (don't even think about /* ... */ style comments!)
            }
//#endif
            if (sym == '{') level += 1;
            if ((sym == '}') && (level == 0)) break;
            if (sym == '}') level -= 1;
            _initcode(initcode_nextfree++) = sym;
          }
          _initcode(initcode_nextfree) = '\0';
          break;
          
        } else {
          
          if (def_type != 'P') {
            fatal('{');
          }
          // C code to be executed when this phrase is recognised.
          // At this point in development, just skip the code block
          // and worry about adding it to the data structure later.
          // '=' is next.
          _ast_code(current_def_simple_phraseno) = c_code_block();

          if (nonspace() != ';') fatal(';'); //
        }

      }
      // drop through:
      
    case ';': // end of alternatives.   No warning for not being in a definition?
              // (redundant ';'s between rules appear to be accepted) Use def_alts to fix?

      // def_type must be 'P'; 'C', 'O' and 'B' were handled earlier.
      
      //_xcomment(this_phrase) = current_comment;
      //fprintf(stderr, "B: Setting xcomment(%d) = \"%ls\"\n", this_phrase, String(current_comment));
      //current_comment = Str_nextfree++; _StringPool(current_comment) = '\0';

      // We could tell whether the last alt was null by
      // checking lastsym == ',', should we ever need to know

      // Don't need to assign the other ones on the fly as long as we assign here.
      _gram(alt_count_index)    = alt_count    | COUNT_OF_ALTS;
      _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES;
      
      //if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count;
      alt_count = 0; phrase_count = 0;
      def_phrase = FALSE; def_alts = FALSE;
      current_def_phrasename = unassigned;

      if (NEXT_FREE_GRAMMAR_SLOTNO >= NUM_GRAMMAR) NUM_GRAMMAR = NEXT_FREE_GRAMMAR_SLOTNO;
        
      break;

    case 'E': // END OF GRAMMAR
      return;

    case '#': //  comment to end of line.  Do these only work between phrases? (No)
      {       //  We need to modify nonspace() to handle these too. Or instead.
        //current_comment = Str_nextfree;   // comment handling is currently disabled.
        for (;;) {
          sym = next_wide_char();
          _StringPool(Str_nextfree++) = sym;
          if (sym == '\n') break;
        }
        _StringPool(Str_nextfree++) = '\0';
      }
      break;

    default:
      fprintf(stderr, "* Error: unexpected character '%lc' in %s\n", sym, input_name[input_level]);
      fatal(0);
      
    } // end of the main switch in the for loop

  } // end of the for loop reading the grammar file


}

int main(int argc, char **argv) {
  /* char *locale = */ (void) setlocale(LC_ALL, "");
  FILE *sink = fopen("/dev/null", "w");

  debug_flex = 0;

  if ((sizeof(wchar_t) < 4) || (sizeof(wint_t) < 4)) {
    fprintf(stderr, "takeon: wchar_t or wint_t size is %d bytes "
                    "- please recompile with a real compiler.\n",
            MinINT(sizeof(wchar_t),sizeof(wint_t)));
    exit(EXIT_FAILURE);
  }
  
  //if ((argc >= 2) && (strcmp(argv[1], "-v")==0)) {
  //  argc -= 1; argv += 1; verbose = TRUE;
  //}

  if ((argc >= 2) && (strcmp(argv[1], "-d")==0)) {
    argc -= 1; argv += 1; debug_takeon = TRUE;
  }

  if (argc != 2) {
    fprintf(stderr, "syntax: takeon [-d] compiler.g\n");
    exit(EXIT_FAILURE);
  }

  // Multi-pass in order to determine how many phrases occur
  // in each phrase definition. Should just be 2-pass, last
  // effort was 3.
  for (pass = 0; pass <= 1; pass++) {

    FILE *grammar; // The input file.
    char *grammarfn = NULL;
    // Also source_line, source_column

    grammar = fopen(grammarfn=argv[1], "r");
    if (grammar == NULL) {
      int err = errno;
      fprintf(stderr, "takeon: %s - %s\n", strerror(err), argv[1]);
      exit(err); // caller may need to use WEXITSTATUS to get this value.
    }

    input_level = 0;
    input_file[input_level] = grammar;
    input_name[input_level] = grammarfn;
    
    // We make critical operations idempotent so that
    // executing them twice won't have any ill effects,
    // but some variables do have to be reset for each pass.
    
    NEXT_FREE_BIPNO = 0;
    NEXT_FREE_SIMPLE_PHRASENO = 0;
    NEXT_FREE_SEMANTIC_PHRASENO = 0;

    NEXT_FREE_GRAMMAR_SLOTNO = 0;
    
    // We don't reset NUM_* - preserve completed tables between passes.
    
    LARGEST_ALT = 0;
    BIP_BASE = 0;
    PHRASE_BASE = 0;
    SEMANTIC_BASE = 0;
    AST_BASE = 0;
    
    takeon(stderr); // forward references only work on second pass.

    if (input_level != 0) {
      fprintf(stderr, "Error: input_level = %d at end of grammar.\n", input_level);
    }
    fclose(input_file[input_level]); grammar = NULL; // easier to close and reopen than to rewind.
  }

  dump_tables(stdout/*sink*/); fflush(stdout);
   
#ifdef CHECK_FOR_LOOPS  // This code is entirely optional.  The converter will work without it.
  #include "takeon-checks.c"
    
  check_all_phrases();  // Report any detectable issues in the structure of the grammar.

#endif

  fclose(sink);
  exit(EXIT_SUCCESS); return(EXIT_FAILURE);
}

// Will add error checks later.
