#!/usr/bin/perl
## Licensed to the Apache Software Foundation (ASF) under one
## or more contributor license agreements.  See the NOTICE file
## distributed with this work for additional information
## regarding copyright ownership.  The ASF licenses this file
## to you under the Apache License, Version 2.0 (the
## "License"); you may not use this file except in compliance
## with the License.  You may obtain a copy of the License at
##
##     http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.

# Grammar into HTML
# Read in sparql.txt and the tokens.txt file

## ToDo:
## Check tokens exist and are used
## Validate


if ( $#ARGV != 1 )
{
    print STDERR "Usage: grammar.txt tokens.txt\n" ;
    exit 1 ;
}

$/ = undef ;
# Just table or full page.
$TABLE = 1 ;

$grammarFile = $ARGV[0] ;
$tokensFile = $ARGV[1] ;

$grammar = &readFile($grammarFile) ;
$tokens = &readFile($tokensFile) ;

$grammar =~ s!DOCUMENT START!! ;
# $grammar =~ s!NON-TERMINALS!! ;
$grammar =~ s!DOCUMENT END!! ;
$grammar =~ s!TOKENS.*NON-TERMINALS!!s ;

$grammar =~ s!//.*!!g ;
$grammar =~ s!\r!!g ;

# remove leading whitespace
$grammar =~ s!^[\n\s]*!\n! ;

# Merge alts
$grammar =~ s!\n\s*\|!\ |!g ;

$tokens =~ s!//.*!!g ;
$tokens =~ s!\r!!g ;


## Grammar
#print "GRAMMAR\n" ;

@g = split(/\n\s*/, $grammar) ;

@rules = () ;
%ruleMap = () ;
%tokenMap = () ;
%inline = () ;

# Grammar rules
# Direct from "jjdoc -TEXT=true"

for $g (@g)
{
    ($rulename, $rulebody) = split(/:=/,$g) ;

    $rulename =~ s!^\s*!! ;
    $rulename =~ s!\s*$!! ;

    $rulebody =~ s!^\s*!! ;
    $rulebody =~ s!\s*$!! ;
    
    # Remove outer brackets
#    $rulebody =~ s!^\((.*)\)$!$1! ;

    # Remove <> around tokens in grammar.
    ## Now done very late (as &lt;&gt;) in fixups.
    ## $rulebody =~ s/\<(\w+)\>/$1/g ;
    # Leave in - so tokens distinguished from rules

    next if $rulename eq '' ;
    #next if $rulebody eq '' ;

    # Skip the root rule.
    next if ( $rulename eq 'CompilationUnit' ) ;

    $rulebody = 'Perl 5 regular expression'
	if ( $rulename eq 'PatternLiteral' ) ;

    push @rules, $rulename ;
    warn "Duplicate rule (grammar): $rulename\n" if defined($ruleMap{$rulename}) ;
    $ruleMap{$rulename} = $rulebody ;

##     print "----------\n" ;
##     print $rulename,"\n" ;
##     print $rulebody,"\n" ;

}


# Tokens
# Produced by "jj2tokens"
# Hand edited to indicate the inlines

$tokens =~ s/\n+/\n/g ;
$tokens =~ s/^\n// ;

@t = split(/\n(?=\<|\[)/, $tokens) ;

for $t (@t)
{
    ($tokenname,$tokenbody) = split(/::=/, $t) ;
    $tokenname =~ s!^\s*!! ;
    $tokenname =~ s!\s*$!! ;

##     # remove <> around tokens
## Do very late as a formatting fix up.
##     $tokenname =~ s/^\<// ;
##     $tokenname =~ s/\>$// ;

    $tokenname =~ s/#// ;
    
    $tokenbody =~ s!^\s*!! ;
    $tokenbody =~ s!\s*$!! ;
    
    # <> round tokens
    # Remove at last minute.

    # Remove outer ()
    # $tokenbody =~ s!^\((.*)\)$!$1! ;

    # Inline?
    if ( $tokenname =~ /^\[\<\w*\>\]/ )
    {
	warn "Duplicate inline (token): $tokenname\n" if defined($inline{$tokenname}) ;
	$tokenname =~ s/^\[//g ;
	$tokenname =~ s/\]$//g ;
	$tokenbody =~ s/"/'/g ; # '" -- But not literal " -- how?
        $tokenbody =~ s/\<\>\'\{\}/\<\>\"\{\}/ ; # '" IRI fixup

	$inline{$tokenname} = $tokenbody ;

	#print "INLINE: ",$tokenname," => ",$tokenbody,"\n" ;
    }
    else
    {
	push @rules, $tokenname ;
	warn "Duplicate rule (token): $tokenname\n" if defined($tokenMap{$tokenname}) ;
	$ruleMap{$tokenname} = $tokenbody ; 
    }
}

# Table

if ( ! $TABLE )
{
    print "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" ;
    print "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n" ;
    print "    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" ;
    print "\n" ;

    print "<html>\n";
    print "<head>\n";
    print "<title>SPARQL Grammar</title>\n" ;
    print "<style type=\"text/css\">\n" ;

 # .token inline
 # .ruleHead
 # .ruleBody

    print <<'EOF' ;
div.grammarTable table * { border-width: 0 ; }
div.grammarTable table * tr { border: 1px solid black ; }

.grammar     { text-align: left ; vertical-align: top ; }
.token       { color: #3f3f5f; }
.gRuleHead   { font-style: italic ; font-family: monospace ; }
.gRuleBody   { font-family: monospace ; }
.gRuleLabel  { font-family: monospace ; }
EOF

     print "</style>\n" ;
     print "</head>\n";
     print "<body>\n";

    print "\n" ;
}

print "<div class=\"grammarTable\">\n" ;
print "  <table><tbody>\n" ;

$ruleNum = 0 ;

for $r (@rules)
{
    $DEBUG = 0 ;
    $ruleNum++ ;
    $rulename = $r ;
    
    $rulebody = $ruleMap{$rulename} ;

##    $DEBUG = 1 if ( $rulename =~ /Prolog/ ) ;

    $rb = $rulebody ;

    if ( $DEBUG )
    {
	print STDERR "\n" ;
	print STDERR "Rule: $rulename\n" ; 
	print STDERR "Body: $rulebody\n" ; 
    }

    ## Do before '||' substitution
    # Not perfect - some fixups later.
    #$rb =~ s%\|%\<br/\>\|%g ;

    # Escape HTML chars before adding markup.
    $rb = esc($rb) ;
    
    # Inlines
    for $k (keys %inline)
    {
	$s = span('token', $inline{$k}) ;
	$k = esc($k) ;
	# Assumes escaped <> round tokens.
	$k = quotemeta $k ;
	$rb =~ s/$k/$s/g ;

    }

    if ( $DEBUG )
    {
	print STDERR "After inlining\n" ;
	print STDERR $rb,"\n" ; ; 
    }


    # Add hrefs - issue if one is a substring of another \W helps.

    for $k (keys %ruleMap)
    {
	$s = href("r-".$k,$k) ;

	$k = esc($k) ;
	$k = quotemeta $k ;

## 	if ( $DEBUG )
## 	{
## 	  print STDERR "K:$k\n" ;  
## 	}


	$rb =~ s/(?=\W)(\s*)$k(\s*)(?=\W)/$1$s$2/g ;
	$rb =~ s/^$k(\s*)(?=\W)/$s$1/g ;
	$rb =~ s/(?=\W)(\s*)$k$/$1$s/g ;
	$rb =~ s/^$k$/$s/g ;
    }
    
    if ( $DEBUG )
    {
	print STDERR "After hrefs\n" ;
	print STDERR $rb,"\n" ; ; 
    }

    #exit if $ruleNum > 2 ;

    $rn = anchor("r-".$rulename, $rulename) ;
    $rn = fixupHead($rn) ;

    if($rulename eq 'IRIREF') 
    {
	print "  </tbody></table>\n" ;
	print "</div>\n" ;
	print "<p>Productions for terminals:</p>\n" ;
	print "<div class=\"grammarTable\">\n" ;
	print "  <table><tbody>\n" ;
    }


    print "\n" ;
    print "<tr valign=\"baseline\">\n" ;
    $rlabel = '[' . $ruleNum .  ']&nbsp;&nbsp;' ;

    print "  <td>",code('gRuleLabel', $rlabel),"</td>\n" ;

    #print "  <td>",span('gRuleHead', $rn),"</td>\n" ;
    print "  <td>",code('gRuleHead',$rn),"</td>\n" ;

    print "  <td>&nbsp;&nbsp;::=&nbsp;&nbsp;</td>\n" ;
    
    $rb = fixupRule($rulename, $rb) ;
    print "  <td>",code('gRuleBody',$rb),"</td>\n" ;

    print "</tr>\n" ;

#    $rule{$rulename, $rulebody) ;
#    print $rulename , "\n" ;
}

print "  </tbody></table>\n" ;
print "</div>\n" ;

if ( !$TABLE )
{
    print "\n" ;
    print "</body>\n" ;
    print "</html>\n" ;
}

sub readFile
{
    my $f = $_[0] ;
    open(F, "$f") || die "$f: $!"; 
    my $s = <F> ;
    return $s ;
}

sub esc
{
    my $s = $_[0] ;
    $s =~ s/&/&amp;/g ; 
    $s =~ s/</&lt;/g ; 
    $s =~ s/>/&gt;/g ; 
    return $s ;
}

sub span
{
    my $c = $_[0] ;
    my $t = $_[1] ;
    $t = esc($t) ;
    my $s = '<span class="' . $c . '">' . $t . '</span>' ;
    return $s ;
}

sub href
{
    my $a = $_[0] ;
    my $t = $_[1] ;
    $a = sane($a) ;
    $t = esc($t) ;
    my $s = '<a href="#' . $a . '">' . $t . '</a>' ;
    return $s ;
}

sub anchor
{
    my $a = $_[0] ;
    my $t = $_[1] ;
    $a = sane($a) ;
    $t = esc($t) ;
    my $s = '<a id="' . $a . '" name="' . $a . '">' . $t . '</a>' ;
    return $s ;
}

sub sane
{
   my $a = $_[0] ;
   $a =~ s/\W//g ;
   return $a ;
}

sub code
{
    my $c = $_[0] ;
    my $t = $_[1] ;
    return '<code class="' . $c . '">' . $t . '</code>' ;
}

sub fixupHead
{
    my $head = $_[0] ;
    # Remove <> around tokens.
    $head =~ s/&lt;(\w+)&gt;/$1/g ;
    return $head ;
}

sub fixupRule
{
    my $head = $_[0] ;
    my $body = $_[1] ;

    # Remove unnecessary ()
    $body =~ s/\(\s*([^()| ]*) \)/$1/g ;

##     if ( $body =~ m!\(\s+(\<a[^>]*\>[^<>]*\</a\>)\s+\)! )
##     {
## 	$b = $body ;
## 	print "================================\n" ;
## 	print STDERR "$b\n" ;
## 	print STDERR "--------\n" ;
## 	$b =~ s!\(\s+(\<a[^>]*\>[^<>]*\</a\>)\s+\)!$1!g ;
## 	$b =~ s!\(\s+(\<span[^>]*\>[^<>]*\</span\>)\s+\)!$1!g ;
## 	print STDERR "$b\n" ;
## 	print STDERR "=====\n" ;
## 	print STDERR "\n" ;
##     }


    # Remove outer matching () where there are no inner ()
    $body =~ s/^\(\s+([^\(]*)\s+\)$/$1/ ;

    # ( A )* => A* and for + and ? where A is a linked or spanned object
    $body =~ s!\(\s+(\<a[^>]*\>[^<>]*\</a\>)\s+\)!$1!g ;
    $body =~ s!\(\s+(\<span[^>]*\>[^<>]*\</span\>)\s+\)!$1!g ;

    # There aren't any of these
##    $body =~ s!\(\s+(\S*)\s+\)!$1!g ;

    # Remove <> around tokens.
    $body =~ s/&lt;(\w+)&gt;/$1/g ;

    # Specials
    # Split long bodies
    if ( $head eq "CallExpression" ||
	 $head eq "UnaryExpression" ||
	 $head eq "<NCCHAR1p>" ||
	 $head eq "PatternElement" ||
	 $head eq "BuiltInCall" )
    {
	$body =~ s%\|%\<br/\>\|%g ;
	$body =~ s/^\s+// ;
	$body = "&nbsp;&nbsp;".$body ;
    }

     if ( $head eq "Aggregate" )
     {
	 # Strip outer ()
	 $body =~ s/^\(\s*(.*)\s*\)$/$1/ ;	
	 ## Be careful of nested | in COUNT
 	 $body =~ s%(\| \<span class="token")%\<br/\>$1%g ;
 	 $body =~ s/^\s+// ;
 	 $body = "&nbsp;&nbsp;".$body ;
     }


    if (  $head eq "BuiltInCall" )
    {
	# Undo <br/> for BNODE, RAND
	# <br/>| <a href="#rNIL">NIL</a> )
	$body =~ s%\<br/\>\| *\<a href="#rNIL"\>NIL\</a\>%\| \<a href="#rNIL"\>NIL\</a\>%g ;
    }

    if ( $head eq "RelationalExpression" ||
	 $head eq "AdditiveExpression" ||
	 $head eq "MultiplicativeExpression" ||
	 $head eq "ConditionalOrExpression")
    {
	$body =~ s%\*\(%<br/>\(% ;
    }

    # These failed the outer () test because they have nested () in them
    if (  $head eq "QueryPattern" ||
          $head eq "OrderCondition" )
    {
	# Remove outer ()
	$body =~ s/^\((.*)\)$/$1/ ;	
    }

    if (  $head eq "Query" )
    {
	$body =~ s! \(!<br/>\(! ;
	$body =~ s!\) !\)<br/>! ;
    }

    if (  $head =~ m/(Select|Construct|Describe|Ask)Query/ )
    {
	# Put a line break before the DatasetClause
	# <a href="#rDatasetClause">DatasetClause</a>
	$c = '<a href="#rDatasetClause">DatasetClause</a>' ;
	$c = quotemeta $c ;
	# Expects the dataset clause to be unbracketted
	$body =~ s!(\(\s*$c)!<br/>$1! ;
    }

    if ( $head eq "OrderCondition" )
    {
	$body =~ s!\)\s*\|\s*\(!\)<br/>\| \(! ;
	$body = "  ".$body ;
    }

    #Rules where an outer () is unnecessary.
    if ( $head eq "GroupCondition" ||
	 $head eq "LimitOffsetClauses" ||
	 $head eq "GraphOrDefault" ||
	 $head eq "ArgList" ||
	 $head eq "ExpressionList" ||
	 $head eq "PathPrimary" ||
	 $head eq "PathMod" ||
	 $head eq "PathPrimary" ||
	 $head eq "PathNegatedPropertySet" || 
	 $head eq "PathOneInPropertySet")
    {
	$body =~ s/^\(\s*(.*)\s*\)$/$1/ ;	
    }
    return $body ;
}
