#!/bin/sh
# The next line is executed by /bin/sh, but not tcl \
exec tclsh "$0" ${1+"$@"}

## This script will generate sample sentences out of a given grammar.
## Author: Georgios Petasis, petasis@iit.demokritos.gr
##   2/09/2002

proc Usage {} {
  global argv0
  puts stderr "Usage: $argv0 grammar-file \[number-of-iterations]"
  exit -1
}

if {$argc != 1 && $argc != 2} {
  Usage
}

set MaxBodyLen 40

## Read the grammar from the specified file and save it in a global array...
set GrammarName [lindex $argv 0]
set Grammar [open $GrammarName]
set GrammarName [file rootname $GrammarName]

proc ReadGrammar {Grammar} {
  global Rule Head Frequency
  while {[gets $Grammar line] > 0} {
    set rule [split [regsub -all {_[0-9]+} [string trim $line] {}]]
    if {![llength $rule]} {continue}
  
    ## Ignore "Rule"
    set head [lindex $rule 1]
    set PBody [lrange $rule 3 end]
    ## Has the rule a frequency marker?
    if {[string match {\[*\]} [lindex $PBody end]]} {
      set PBody [lreplace $PBody end end]
    }
    set Body $PBody
    lappend Rule($head) $Body
    lappend Head($Body) $head
    set Frequency($head->$Body) 0
  };# while {1}
  ## Ensure no duplicate rules exist...
  foreach head [array names Rule] {
    set Rule($head) [lsort -unique $Rule($head)]
  }
  foreach body [array names Head] {
    set Head($body) [lsort -unique $Head($body)]
  }
};# ReadGrammar
ReadGrammar $Grammar
close $Grammar
parray Rule


## SelectBody
#    Select and return a rule body that has as head the provided symbol.
#    Selection is done with equal probability among the alternatives...
proc SelectBody {head} {
  global Rule Head Frequency
  set bodies $Rule($head)
  if {[llength $bodies] == 1} {
    set best_body [lindex $bodies 0]
    incr Frequency($head->$best_body)
    return $best_body
  }

  ## Re-sort randomly the rule bodies...
  for {set i 0} {$i < 1} {incr i} {
    foreach body $bodies {
      set Randomiser([expr {rand()*1000.0}]) $body
    }
    set bodies {}
    foreach one [lsort -real [array names Randomiser]] {
      lappend bodies $Randomiser($one)
    }
  }
  
  ## The following block selects a rule body randomly. It should be good, as
  ## the Tcl rand() function probability is flat...
  set index [expr {int(rand()*[llength $bodies])}]
  set best_body [lindex $bodies $index]
  incr Frequency($head->$best_body)
  #return $best_body
  
  ## The following block assigns frequencies to rule bodies...
  ## Find the combination of head -> body having the lower frequency...
  set best_body [lindex $bodies 0]
  set min $Frequency($head->$best_body)
  set selection_done 0
  foreach body $bodies {
    if {$Frequency($head->$body) < $min} {
      set min $Frequency($head->$body)
      set best_body $body
      incr selection_done
    }
  }
  ## Update the frequency of the selected rule...
  incr Frequency($head->$best_body)
  return $best_body
};# SelectBody

## GenerateSentence
#    Walk through the grammar, by recursivelly expanding all non-terminal
#    symbols to generate a sentence...
proc GenerateSentence {Symbol} {
  global Rule CurrentLevel MaxBodyLen
  if {$CurrentLevel > $MaxBodyLen} {error "max body length exceeded!"}
  incr CurrentLevel

  ## Choose randomly what path we will follow, if we have more than one
  ## possible paths...
  if {![info exists Rule($Symbol)]} {
    ## This is a terminal symbol!

    ## Look in the caller stack to see what was the previous symbol that
    ## generated the terminal...
    global TerminalSymbols
    if {![info exists TerminalSymbols($Symbol)]} {
      puts "Terminal \"$Symbol\""
      puts stderr "Terminal \"$Symbol\""
      set TerminalSymbols($Symbol) 1
    }
    upvar Symbol NT
    return [list $Symbol $NT]
  }

  set Res {}
  foreach symbol [SelectBody $Symbol] {
    set Res [concat $Res [GenerateSentence $symbol]]
  }
  return $Res
};# GenerateSentence

## Now, genarate sentences :-)
proc GenerateSentences {iterations} {
  global Rule Sentence CurrentLevel
  for {set i 0} {$i < $iterations} {incr i} {
    set sentence {}
    set CurrentLevel 0
    if {[catch {GenerateSentence S} sentence]} {
      ## We have exausted the tcl recursion limit. This probably means that we
      ## have a highly ambiguous grammar...
      puts stderr "Highly ambiguous grammar? ($sentence)"
      incr i -1
      continue
    }
    if {[info exists Sentence($sentence)]} {
      incr Sentence($sentence)
    } else {
      set Sentence($sentence) 1
    }
    puts stderr "$i -> $sentence"
  }
};# GenerateSentences

set iterations 500000
if {$argc > 1} {
  set iterations [lindex $argv 1]
}

GenerateSentences $iterations
## Write the generated sentences...

set OutFileRoot [file join [file dirname $GrammarName] \
                Initial_[file rootname [file tail $GrammarName]]]
set G [open $OutFileRoot.grm w]
set C [open $OutFileRoot.cat w]
set L [open $OutFileRoot.lex w]
set S [open $OutFileRoot.txt w]
foreach s [array names Sentence] {
  ## Do not print empty sentences...
  if {![llength [string trim $s]]} {continue}
  puts -nonewline $G "Rule S -> "
  puts -nonewline $S "Rule S -> "
  foreach {terminal NT} [string trim $s] {
    puts -nonewline $G "$NT "
    puts -nonewline "$terminal "
    puts -nonewline $S "$terminal "
    lappend Categories($terminal) $NT
  }
  puts $G {}
  puts $S {}
  puts {}
}
close $G

foreach terminal [lsort -dictionary [array names Categories]] {
  foreach cat [lsort -dictionary -unique $Categories($terminal)] {
    puts $C "Rule $cat -> $terminal"
    puts $L "\\w $terminal\n\\c $cat\n"
  }
}
close $C
close $L
close $S
parray Frequency
parray TerminalSymbols
puts stderr "Generated [llength [array names Sentence]] Sentences out of $iterations iterations..."
exit 0

