// TAKEON: Version 2. This is a rewrite of the original takeon. // If B clashes with P no warning is given other than Unused. // Convert a grammar into tables. // PENDING: 1) improve some errors, eg when '=' is missing after P<...> // 2) Currently using embedded comments containing the grammar // in the main code file. An alternative design is to embed // code in the grammar file, yacc style (with { ... } sections // immediately before an option separator ('|' or ',') or // the rule terminator (';'). Similar to my earlier 'tacc' // parser (e.g. as in https://gtoal.com/languages/algol60/algolps9.e.html ) // - the latter does simplify accessing parameters and passing // results back, though at the same time it limits flexibility. // 3) When creating the imp77 grammar I came across a problem // that was related to the presence or absence of BIPs, which // interfered with the constants for the first P<> phrase. // I need to recreate that broken grammar file and fix the bug. // 4) Currently no checking that there is a B<...> and a P<...> // of the same name. // 5) Aadd a nullable() test of grammar rules for regexps - // try it against a null string and see if it matches! // (to expand the build-time loop detection) /* An erroneous ';' is not detected. The following option should have a P<> but doesn't which *ought* to be detected but isn't. P = , ; ; */ #include #include #include #include #include #include #include #include #ifndef FALSE #define FALSE (0!=0) #endif #ifndef TRUE #define TRUE (0==0) #endif #define CHECK_FOR_LOOPS #define MAX_INCLUDES 4 // really ought to put these in a struct FILE *input_file[MAX_INCLUDES]; char *input_name[MAX_INCLUDES]; int input_offset[MAX_INCLUDES]; int input_line[MAX_INCLUDES]; int input_column[MAX_INCLUDES]; int input_last_line_last_column[MAX_INCLUDES]; int input_level = -1; static int debug_takeon = 0; //#define S(pool) ((wchar_t *)&StringPool(pool)) #include "takeon-output.c" static void takeon(FILE *f) { // State variables: static int def_phrase = FALSE; // Are we defining a P<...> phrase at the moment? // I.e. have we already seen the 'P' part of // "P<...> = " but not the '='? static int def_alts = FALSE; // Are we defining a alt at the moment? // I.e. have we already seen the '=' part of // "P<...> = "? static int def_type = 0; // The type of the current phrase definition, // i.e. P<>, C<>, O<>, or B<>. static int sym = unassigned; // Describing the structure of a phrase: static int specialbit = 0; // '?' or '!' in a term, eg or . // Only allowed in phrase names, not literals. // though really there's no good reason that we // couldn't support those too, with just a bit // of extra work and some care. static int alt_count = 1; static int alt_count_index = 0; static int phrase_count = 0; static int phrase_count_index = 0; _initcode(initcode_nextfree = 0) = '\0'; for (;;) { sym = nonspace(); switch (sym) { // almost all reads are done here (excepting calls to 'upto') // The structure of this code approximates a state machine. case 'I': // Include file. case 'O': // OPTION DEFINITION (output on the fly, no need to store in a data structure) case 'B': // BIP DEFINITION (requires matching code manually added to the parser) case 'P': // PHRASE DEFINITION (regular phrase consisting of terms and recursive phrase calls) case 'C': // CODE DEFINITION (in-line code for parse-time actions. Use very carefully) if (def_alts || def_phrase) { fprintf(stderr, "ERROR: Phrase <%ls> was not finalised before starting a new one." " Missing semi-colon!\n", String(current_def_phrasename)); exit(EXIT_FAILURE); } def_phrase = TRUE; def_alts = FALSE; def_type = sym; // We'll wait until we get the name before we do anything else. ('<' is next) break; case '<': // This occurs either at the point of a phrase definition // (P<...> =) *or* as a term within an alt (= ;). if (def_phrase) { // We're looking at the definition of a phrase: phrase_count = 0; current_def_phrasename = upto(L'>'); switch (def_type) { case 'O': // 'O<>' will be added later. break; case 'B': _bip_phrasename(current_def_bipno = NEXT_FREE_BIPNO++) = current_def_phrasename; if (NEXT_FREE_BIPNO >= NUM_BIPS) NUM_BIPS = NEXT_FREE_BIPNO; break; case 'P': _phrasename(current_def_simple_phraseno = NEXT_FREE_SIMPLE_PHRASENO++) = current_def_phrasename; _ast_code(current_def_simple_phraseno) = -1; if (NEXT_FREE_SIMPLE_PHRASENO >= NUM_SIMPLE_PHRASES) NUM_SIMPLE_PHRASES = NEXT_FREE_SIMPLE_PHRASENO; break; case 'C': _semantic_phrasename(current_def_semantic_phraseno = NEXT_FREE_SEMANTIC_PHRASENO++) = current_def_phrasename; if (NEXT_FREE_SEMANTIC_PHRASENO >= NUM_SEMANTIC_PHRASES) NUM_SEMANTIC_PHRASES = NEXT_FREE_SEMANTIC_PHRASENO; break; case 'I': if (nonspace() != ';') fatal(';'); // push context input_offset[input_level] = file_offset; input_line[input_level] = source_line; input_column[input_level] = source_column; input_last_line_last_column[input_level] = last_line_last_column; input_level += 1; if (input_level >= MAX_INCLUDES) { fprintf(stderr, "Error: I files are too deeply nested:\n"); while (--input_level >= 0) { fprintf(stderr, " %s\n", input_name[input_level]); } exit(1); } char string[1024]; int length = wcstombs(string, String(current_def_phrasename), 1023); // NARROW CONVERSION (void)length; input_name[input_level] = strdup(string); input_file[input_level] = fopen(input_name[input_level], "r"); if (input_file[input_level] == NULL) { fprintf(stderr, "Error: could not open include file: %s - %s\n", input_name[input_level], strerror(errno)); exit(1); } //fprintf(stderr, "\n ----------------------------------- INCLUDE %s ------------------------------------\n", input_name[input_level]); // reset key state variables. def_phrase = FALSE; def_alts = FALSE; def_type = 0; break; } break; // Next state is '='. Maybe create a state variable to explicitly encode that, // to make detecting errors cleaner? } else { // We're looking at an instance of a phrase within an alt: int i; phrase_count += 1; // number of phrases seen in this alt. _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES; if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count; sym = next_wide_char(); // or or just <...> ? if (sym == '?') { specialbit = GUARD_PHRASE; // parse and assign but do not step the pointer past this location (NOTE/POP-POINTER) } else if (sym == '!') { // or use '~' or '\' ??? // parse & invert. return NULL on a failed parse, don't step // the pointer, and continue parsing this alt. specialbit = NEGATED_PHRASE; } else { unget_wide_char(sym, input_file[input_level]); specialbit = 0; } current_use_phrasename = upto(L'>'); //fprintf(stderr, "{name=%ls}\n",name); // The phrase name may be a forward reference, at least during the first pass... if (pass == 0) { _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = 0; // This hole will be filled in on the next pass. break; } // Now on a pass where tables are complete... for (i = 0; i < NUM_BIPS; i++) { if (wcscmp(String(bip_phrasename(i)), String(current_use_phrasename)) == 0) { _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = i | specialbit /*GUARD_PHRASE or NEGATED_PHRASE*/ | BIP_TYPE; break; } } // MINOR BUG HERE. When adding a P<...> phrasename to the P<> table, we should also // check that a B<...> or C<...> phrase does not already exist with the same name. // And similarly when adding B<> and C<> phrase names. if (i < NUM_BIPS) break; for (i = 0; i < NUM_SIMPLE_PHRASES; i++) { if (wcscmp(String(phrasename(i)), String(current_use_phrasename)) == 0) { _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = i | specialbit /*GUARD_PHRASE or NEGATED_PHRASE*/ | PHRASE_TYPE; break; } } if (i < NUM_SIMPLE_PHRASES) break; for (i = 0; i < NUM_SEMANTIC_PHRASES; i++) { if (wcscmp(String(semantic_phrasename(i)), String(current_use_phrasename)) == 0) { _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = i | specialbit /*GUARD_PHRASE or NEGATED_PHRASE*/ | SEMANTIC_TYPE; break; } } if (i < NUM_SEMANTIC_PHRASES) break; fprintf(stderr, "* Error: P<%ls> not defined.\n", String(current_use_phrasename)); exit(EXIT_FAILURE); } // Next state is '=' break; case '=': // start of a phrase alternative or a BIP defn // current_def_phrasename should be valid at this point // 'B' and 'P' phrases get added to table. // 'C' and 'O' phrases go in their own arrays. // TO DO: check for duplicate name definition! (regardless of type, // i.e. P must clash with C etc)) // At the moment, a duplicated name is only caught when trying to compile // any C file which includes the .h file. if (!def_phrase) { fatal(0); // We have an '=' that doesn't follow any sort of phrase definition start. } switch (def_type) { case 'O': // OPTION DEFINITION (output on the fly, no need to store in a data structure) { int number; sym = nonspace(); if (!isdigit(sym)) fatal('0'); number = sym-'0'; sym = get_wide_char(input_file[input_level]); if (isdigit(sym)) { number = number*10 + (sym-'0'); } else { unget_wide_char(sym, input_file[input_level]); } if (pass == 1) { // Would be much better *inside* the #ifdef GRAMMAR_G test. // This needs to migrate to an internal array. //fprintf(stdout, "#define OPTION_%ls %d\n", // CString(current_def_phrasename), // number); // check that - -> _ conversion has been done } break; } case 'B': // BIP DEFINITION (requires matching code manually added to the parser) { // The entire O<> or B<> definition is handled here immediately. // expect a number and a ';'. // PLANNED UPGRADE: don't allow '0' as an explicit number, // and always preload B=0; because *all* parsers require a B_WEOF BIP. // Ditto for B_char, however NL and various strings etc are entirely optional // and could be built with P<> rules rather than BIPs. int c, number = nonspace(); if (!isdigit(number)) fatal('0'); number -= '0'; c = get_wide_char(input_file[input_level]); if (isdigit(c)) { number = number*10 + (c-'0'); } else { unget_wide_char(c, input_file[input_level]); } _bip_phrasename(current_def_bipno) = current_def_phrasename; _bip_map(current_def_bipno) = number; if (nonspace() != ';') fatal(';'); // def_phrase = FALSE; // Current comment is saved here for BIPS and Options. //_xcomment(this_phrase) = current_comment; //fprintf(stderr, "A: Setting xcomment(%d) = \"%ls\"\n", this_phrase, String(current_comment)); //current_comment = Str_nextfree++; _StringPool(current_comment) = '\0'; break; } case 'C': // CODE DEFINITION (in-line code for parse-time actions. Use very carefully) { // The entire C<> definition is handled here immediately. if (nonspace() != '{') { fatal('{'); // bracketed C code missing. } // _semantic_phrasename(current_def_semantic_phraseno) already filled in _semantic_code(current_def_semantic_phraseno) = c_code_block(); if (nonspace() != ';') fatal(';'); // def_phrase = FALSE; // No comments saved for C<> definition blocks. //current_comment = Str_nextfree++; _StringPool(current_comment) = '\0'; break; } case 'P': // PHRASE DEFINITION (regular phrase consisting of terms and recursive phrase calls) { // P<...> = ... ; _sequential_phrase_no_to_grammar_index(current_def_simple_phraseno) = NEXT_FREE_GRAMMAR_SLOTNO; // _phrasename(current_def_simple_phraseno) = current_def_phrasename; already assigned _grammar_index_to_sequential_phrase_number(NEXT_FREE_GRAMMAR_SLOTNO) = current_def_simple_phraseno; def_phrase = FALSE; def_alts = TRUE; alt_count = 1; alt_count_index = NEXT_FREE_GRAMMAR_SLOTNO++; // Hole for number of alternatives _gram(alt_count_index) = alt_count | COUNT_OF_ALTS; //phrase_count = 0; phrase_count_index = NEXT_FREE_GRAMMAR_SLOTNO++; // hole for number of items in first alt. _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES; //if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count; // Next state is a term of some description. break; } } // end of the inner switch break; // from the outer switch, back to the main loop. case '\'': // string literal currently implemented as a sequence of single chars if (def_phrase) { fprintf(stderr, "Line %d: P'...' is not allowed. 'Literals' are" " only allowed on the RHS of the '='.\n", source_line); exit(EXIT_FAILURE); } else { STRINGINDEX string = upto(L'\''); // TO DO: a string literal or a keyword literal can contain // C-style '\' escapes such as \n or \u2345 // Should we allow whitespace only before the string or also between each character? // - depends on whether it is a keyword and what kind of stropping. { wchar_t *s = String(string); while (*s != '\0') { if ((((*s)&~255) != 0) && (((*s)&~255) != 0xFFFFFF00)) { // TO DO: UPGRADE THIS CONSTRUCT TO USE STRING_TYPE RATHER THAN CHAR_TYPE. THEN DELETE CHAR_TYPE. // phrase_count += wcslen(String(string)); // Each char counts as a phrase, albeit a short one... Moved to loop. fprintf(stderr, "Warning: Unicode char constant truncated: %lc\n", (wint_t)(*s)); } _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = CHAR_TYPE | WHITESPACE_ALLOWED | ((*s++)&255); // 8-bit only - hence why deprecating phrase_count += 1; } } _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES; if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count; break; } case '"': // keyword literal if (def_phrase) { fprintf(stderr, "Line %d: P\"...\" is not allowed. \"Keywords\" are only allowed on the RHS of the '='.\n", source_line); exit(EXIT_FAILURE); } else { STRINGINDEX keyword = upto(L'"'); phrase_count += 1; _gram(phrase_count_index) = phrase_count; if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count; // TO DO: a string literal or a keyword literal can contain C-style '\' escapes such as \n or \u2345 _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = WHITESPACE_ALLOWED | keyword_code(keyword) /* TYPE already inserted by *_code() */; break; } case L'«': // regexp literal if (def_phrase) { fprintf(stderr, "Line %d: P«...» is not allowed. Regular «expressions» are" " only allowed on the RHS of the '='\n", source_line); exit(EXIT_FAILURE); } else { // regexp literal STRINGINDEX regex, anchored_regexp = Str_nextfree++; _StringPool(anchored_regexp) = '^'; regex = upto(L'»'); // a little hacky. Relies on storing at Str_nextfree. assert(regex == anchored_regexp+1); phrase_count += 1; _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES; if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count; _gram(NEXT_FREE_GRAMMAR_SLOTNO++) = regexp_code(anchored_regexp) | WHITESPACE_ALLOWED; } break; case '|': // next alternative case ',': // next alternative // previous phrase: _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES; // should have already been done if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count; // Next alt: alt_count += 1; _gram(alt_count_index) = alt_count | COUNT_OF_ALTS; phrase_count = 0; phrase_count_index = NEXT_FREE_GRAMMAR_SLOTNO++; _gram(phrase_count_index) = phrase_count; break; case '{': // Global Declarations, copied over to the .h file. { if (!def_alts) { // There is only one anonymous initcode block. Multiple {} sections are // simply appended to it. // TO DO: I believe there is currently a problem if there is an unbalanced single or // double quote in a comment within a C code block in a .g file. Not sure if it is // this block or the one in takeon-output.c. It may be that the problem is in the /* ... */ style of comment. int level = 0; // WE NEED A PROPER C LEXER HERE OTHERWISE MESSED UP BY '{'s IN COMMENTS! for (;;) { sym = next_wide_char(); //#ifdef NEVER // not yet used. C<...> = { ... }; blocks are implemented in takeon-output.c and if (sym == '/') { // should be very similar to the code here. _initcode(initcode_nextfree++) = sym; sym = next_wide_char(); if (sym == '/') { // comment for (;;) { _initcode(initcode_nextfree++) = sym; sym = next_wide_char(); if (sym == '\n') break; } } // Also needs to handle ' and " strings outside comments. // (don't even think about /* ... */ style comments!) } //#endif if (sym == '{') level += 1; if ((sym == '}') && (level == 0)) break; if (sym == '}') level -= 1; _initcode(initcode_nextfree++) = sym; } _initcode(initcode_nextfree) = '\0'; break; } else { if (def_type != 'P') { fatal('{'); } // C code to be executed when this phrase is recognised. // At this point in development, just skip the code block // and worry about adding it to the data structure later. // '=' is next. _ast_code(current_def_simple_phraseno) = c_code_block(); if (nonspace() != ';') fatal(';'); // } } // drop through: case ';': // end of alternatives. No warning for not being in a definition? // (redundant ';'s between rules appear to be accepted) Use def_alts to fix? // def_type must be 'P'; 'C', 'O' and 'B' were handled earlier. //_xcomment(this_phrase) = current_comment; //fprintf(stderr, "B: Setting xcomment(%d) = \"%ls\"\n", this_phrase, String(current_comment)); //current_comment = Str_nextfree++; _StringPool(current_comment) = '\0'; // We could tell whether the last alt was null by // checking lastsym == ',', should we ever need to know // Don't need to assign the other ones on the fly as long as we assign here. _gram(alt_count_index) = alt_count | COUNT_OF_ALTS; _gram(phrase_count_index) = phrase_count | COUNT_OF_PHRASES; //if (phrase_count > LARGEST_ALT) LARGEST_ALT = phrase_count; alt_count = 0; phrase_count = 0; def_phrase = FALSE; def_alts = FALSE; current_def_phrasename = unassigned; if (NEXT_FREE_GRAMMAR_SLOTNO >= NUM_GRAMMAR) NUM_GRAMMAR = NEXT_FREE_GRAMMAR_SLOTNO; break; case 'E': // END OF GRAMMAR return; case '#': // comment to end of line. Do these only work between phrases? (No) { // We need to modify nonspace() to handle these too. Or instead. //current_comment = Str_nextfree; // comment handling is currently disabled. for (;;) { sym = next_wide_char(); _StringPool(Str_nextfree++) = sym; if (sym == '\n') break; } _StringPool(Str_nextfree++) = '\0'; } break; default: fprintf(stderr, "* Error: unexpected character '%lc' in %s\n", sym, input_name[input_level]); fatal(0); } // end of the main switch in the for loop } // end of the for loop reading the grammar file } int main(int argc, char **argv) { /* char *locale = */ (void) setlocale(LC_ALL, ""); FILE *sink = fopen("/dev/null", "w"); debug_flex = 0; if ((sizeof(wchar_t) < 4) || (sizeof(wint_t) < 4)) { fprintf(stderr, "takeon: wchar_t or wint_t size is %d bytes " "- please recompile with a real compiler.\n", MinINT(sizeof(wchar_t),sizeof(wint_t))); exit(EXIT_FAILURE); } //if ((argc >= 2) && (strcmp(argv[1], "-v")==0)) { // argc -= 1; argv += 1; verbose = TRUE; //} if ((argc >= 2) && (strcmp(argv[1], "-d")==0)) { argc -= 1; argv += 1; debug_takeon = TRUE; } if (argc != 2) { fprintf(stderr, "syntax: takeon [-d] compiler.g\n"); exit(EXIT_FAILURE); } // Multi-pass in order to determine how many phrases occur // in each phrase definition. Should just be 2-pass, last // effort was 3. for (pass = 0; pass <= 1; pass++) { FILE *grammar; // The input file. char *grammarfn = NULL; // Also source_line, source_column grammar = fopen(grammarfn=argv[1], "r"); if (grammar == NULL) { int err = errno; fprintf(stderr, "takeon: %s - %s\n", strerror(err), argv[1]); exit(err); // caller may need to use WEXITSTATUS to get this value. } input_level = 0; input_file[input_level] = grammar; input_name[input_level] = grammarfn; // We make critical operations idempotent so that // executing them twice won't have any ill effects, // but some variables do have to be reset for each pass. NEXT_FREE_BIPNO = 0; NEXT_FREE_SIMPLE_PHRASENO = 0; NEXT_FREE_SEMANTIC_PHRASENO = 0; NEXT_FREE_GRAMMAR_SLOTNO = 0; // We don't reset NUM_* - preserve completed tables between passes. LARGEST_ALT = 0; BIP_BASE = 0; PHRASE_BASE = 0; SEMANTIC_BASE = 0; AST_BASE = 0; takeon(stderr); // forward references only work on second pass. if (input_level != 0) { fprintf(stderr, "Error: input_level = %d at end of grammar.\n", input_level); } fclose(input_file[input_level]); grammar = NULL; // easier to close and reopen than to rewind. } dump_tables(stdout/*sink*/); fflush(stdout); #ifdef CHECK_FOR_LOOPS // This code is entirely optional. The converter will work without it. #include "takeon-checks.c" check_all_phrases(); // Report any detectable issues in the structure of the grammar. #endif fclose(sink); exit(EXIT_SUCCESS); return(EXIT_FAILURE); } // Will add error checks later.