#!/usr/bin/perl
## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements.  See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership.  The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License.  You may obtain a copy of the License at
##
##     http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.

## TODO (2024)
## [ ] Why does ruleBodyStr have a newline?

## Consider rewriting, preserving the inlining.
## Parse lines (and continuation lines). Fixup.

# Grammar into BNF text
# Reads in sparql.txt and the tokens.txt file

if ( $#ARGV != 1 )
{
    print STDERR "Usage: grammar.txt tokens.txt\n" ;
    exit 1 ;
}

$/ = undef ;
# Just table or full page.

$grammarFile = $ARGV[0] ;
$tokensFile = $ARGV[1] ;

$grammar = &readFile($grammarFile) ;
$tokens = &readFile($tokensFile) ;

$grammar =~ s!DOCUMENT START!! ;
# $grammar =~ s!NON-TERMINALS!! ;
$grammar =~ s!DOCUMENT END!! ;
$grammar =~ s!TOKENS.*NON-TERMINALS!!s ;

$grammar =~ s!//.*!!g ;
$grammar =~ s!\r!!g ;

# remove leading whitespace
$grammar =~ s!^[\n\s]*!\n! ;

# Merge alts
$grammar =~ s!\n\s*\|!\ |!g ;

$tokens =~ s!//.*!!g ;
$tokens =~ s!\r!!g ;


## Grammar

@g = split(/\n\s*/, $grammar) ;

@rules = () ;
%ruleMap = () ;
%tokenMap = () ;
%inline = () ;

# Grammar rules
# Direct from "jjdoc -TEXT=true"

for $g (@g) {
    ($rulename, $rulebody) = split(/:=/,$g) ;

    $rulename =~ s!^\s*!! ;
    $rulename =~ s!\s*$!! ;

    $rulebody =~ s!^\s*!! ;
    $rulebody =~ s!\s*$!! ;
    
    # Remove outer brackets
#    $rulebody =~ s!^\((.*)\)$!$1! ;

    next if $rulename eq '' ;

    push @rules, $rulename ;
    warn "Duplicate rule (grammar): $rulename\n" if defined($ruleMap{$rulename}) ;
    $ruleMap{$rulename} = $rulebody ;

##     print "----------\n" ;
##     print $rulename,"\n" ;
##     print $rulebody,"\n" ;
}

# Tokens
# Produced by "jj2tokens"
# Hand edited to indicate the inlines

$tokens =~ s/\n+/\n/g ;
$tokens =~ s/^\n// ;

@t = split(/\n(?=\<|\[)/, $tokens) ;

for $t (@t) {
    ($tokenname,$tokenbody) = split(/::=/, $t) ;
    $tokenname =~ s!^\s*!! ;
    $tokenname =~ s!\s*$!! ;
    $tokenname =~ s/#// ;

    $tokenbody =~ s!^\s*!! ;
    $tokenbody =~ s!\s*$!! ;

    # Inline?
    if ( $tokenname =~ /^\[\<\w*\>\]/ ) {
	warn "Duplicate inline (token): $tokenname\n" if defined($inline{$tokenname}) ;
	$tokenname =~ s/^\[//g ;
	$tokenname =~ s/\]$//g ;
	$tokenbody =~ s/"/'/g ; # '" -- But not literal " -- how?
        $tokenbody =~ s/\<\>\'\{\}/\<\>\"\{\}/ ; # '" IRI fixup
	$inline{$tokenname} = $tokenbody ;

	#print "INLINE: ",$tokenname," => ",$tokenbody,"\n" ;
    } else {
	## Remove < > aroudn a token name.
	my $inlinePlain = $tokenname ;
	$inlinePlain =~ s%^<%%;
	$inlinePlain =~ s%>$%%;
	$inline{$tokenname} = $inlinePlain ;
	push @rules, $tokenname ;
	warn "Duplicate rule (token): $tokenname\n" if defined($tokenMap{$tokenname}) ;
	$ruleMap{$tokenname} = $tokenbody ; 
    }
}

$ruleNum = 0 ;

for $r (@rules) {
    $DEBUG = 0 ;
    $ruleNum++ ;
    $rulename = $r ;
    $rulebody = $ruleMap{$rulename} ;

    if ( $DEBUG ) {
	print STDERR "\n" ;
	print STDERR "Rule: $rulename\n" ; 
	print STDERR "Body: $rulebody\n" ; 
    }

    $ruleBodyStr = $rulebody ;
##    # Escape HTML chars before adding markup.
#    $ruleBodyStr = esc($ruleBodyStr) ;
    
    # Inlines
    for $k (keys %inline) {
## 	$s = esc($inline{$k}) ;
## 	$k = esc($k) ;
## 	# Assumes escaped <> round tokens.
## 	$k = quotemeta $k ;
## 	$ruleBodyStr =~ s/$k/$s/g ;

	$s = $inline{$k};
	$ruleBodyStr =~ s/$k/$s/g ;
    }

    if ( $DEBUG ) {
	print STDERR "After inlining\n" ;
	print STDERR $ruleBodyStr,"\n" ; ; 
    }

    # Add hrefs - issue if one is a substring of another \W helps.

    #exit if $ruleNum > 2 ;

    ##$ruleId = sane("r".$rulename) ;

    if($rulename eq 'IRIREF') {
	print "\n";
	print "\@terminals\n";
	print "\n" ;
    }
    ## First part of output.
    ## $rlabel = '[' . $ruleNum .  ']' ;

    $rn = $rulename;
    $rn =~ s!^<!!; 
    $rn =~ s!>$!!; 
    ## Second part of output
    
    ## Third part of the output.
    $ruleBodyStr = fixupRule($rulename, $ruleBodyStr) ;

    ##Why no NL needed?
    ## printf "%-5s  %-25s ::= %s\n", $rlabel, $rn, $ruleBodyStr ;
    printf "%-25s ::= %s\n", $rn, $ruleBodyStr ;

#    $rule{$rulename, $rulebody) ;
#    print $rulename , "\n" ;
}

sub readFile {
    my $f = $_[0] ;
    open(F, "$f") || die "$f: $!"; 
    my $s = <F> ;
    return $s ;
}

sub esc {
    my $s = $_[0] ;
    $s =~ s/&/&amp;/g ; 
    $s =~ s/</&lt;/g ; 
    $s =~ s/>/&gt;/g ; 
    return $s ;
}

sub sane {
   my $a = $_[0] ;
   $a =~ s/\W//g ;
   return $a ;
}

sub fixupRule {
    my $head = $_[0] ;
    my $body = $_[1] ;

    # Remove unnecessary ()
    $body =~ s/\(\s*([^()| ]*) \)/$1/g ;

    # Remove outer matching () where there are no inner ()
    $body =~ s/^\(\s+([^\(]*)\s+\)$/$1/ ;

    # There aren't any of these
##    $body =~ s!\(\s+(\S*)\s+\)!$1!g ;

    # Remove <> around tokens.
    $body =~ s/&lt;(\w+)&gt;/$1/g ;

    # Specials
    # Split long bodies
    if ( $head eq "CallExpression" ||
	 $head eq "UnaryExpression" ||
 	 $head eq "<PN_CHARS_BASE>" ||
 	 $head eq "PatternElement" ||
 	 $head eq "BuiltInCall" ||
	 $head eq "Aggregate" )
    {
	## Except BNODE, STRLEN etc
	## [138]  	RegexExpression	  ::=  	'REGEX' '(' Expression ',' Expression ( ',' Expression )? ')'
	## [139]  	SubstringExpression	  ::=  	'SUBSTR' '(' Expression ',' Expression ( ',' Expression )? ')'
	## [140]  	StrReplaceExpression	  ::=  	'REPLACE' '(' Expression ',' Expression ',' Expression ( ',' Expression )? ')'
	## [141]  	ExistsFunc	  ::=  	'EXISTS' GroupGraphPattern
	## [142]  	NotExistsFunc	  ::=  	'NOT' 'EXISTS' GroupGraphPattern
	##        | 'BNODE' ( '(' Expression ')' | <NIL> )

	print STDERR "A:",$body,"\n" if $p ;

	## Fix up for BNODE that uses "|" - put in marker.
	$body =~ s%'BNODE' \( '\(' Expression '\)' \| \<NIL\> \)%XXX-BNODE-XXX%;

	## | followed by <NIL>
	$body =~ s%\|\s*%\n                          |   %g ;
	
	## Replace marker
	$body =~ s%XXX-BNODE-XXX%'BNODE' \( '(' Expression ')' \| \<NIL\> \)%;
    }

     if ( $head eq "Aggregate" )
     {
	 # Strip outer ()
	 $body =~ s/^\(\s*(.*)\s*\)$/$1/ ;	
 	 $body =~ s/^\s+// ;
     }

    # These failed the outer () test because they have nested () in them
    if (  $head eq "QueryPattern" ||
          $head eq "OrderCondition" )
    {
	# Remove outer ()
	$body =~ s/^\((.*)\)$/$1/ ;	
    }

    ## Split?
##     if (  $head eq "Query" )
##     {
##     }

    #Rules where an outer () is unnecessary.
    if ( $head eq "GroupCondition" ||
	 $head eq "LimitOffsetClauses" ||
	 $head eq "GraphOrDefault" ||
	 $head eq "ArgList" ||
	 $head eq "ExpressionList" ||
	 $head eq "PathPrimary" ||
	 $head eq "PathMod" ||
	 $head eq "PathPrimary" ||
	 $head eq "PathNegatedPropertySet" || 
	 $head eq "PathOneInPropertySet")
    {
	$body =~ s/^\(\s*(.*)\s*\)$/$1/ ;	
    }

    return $body ;
}
