Am trying to generate sentences from a defined grammar with python, to avoid agreement problem I used feature structures,
This is the code I have done so far:
>>> from __future__ import print_function
>>> import nltk
>>> from nltk.featstruct import FeatStruct
>>> from nltk import grammar, parse
>>> from nltk.parse.generate import generate
>>> from nltk import CFG
>>> g = """
% start DP
DP-> D[AGR=[NUM='sg', PERS=3, GND='m']] N[AGR=[NUM='sg', GND='m']]
D[AGR=[NUM='sg', PERS=3, GND='f']] -> 'une' | 'la'
D[AGR=[NUM='sg', PERS=3, GND='m']] -> 'un' | 'le'
D[AGR=[NUM='pl', PERS=3]] -> 'des' | 'les'
N[AGR=[NUM='sg', GND='m']] -> 'garçon'
N[AGR=[NUM='pl', GND='m']] -> 'garçons'
N[AGR=[NUM='sg', GND='f']] -> 'fille'
N[AGR=[NUM='pl', GND='f']] -> 'filles'
"""
>>> for sentence in generate(grammar, n=30):
print(''.join(sentence))
This is the output am getting:
unegarçon
unegarçons
unefille
unefilles
lagarçon
lagarçons
lafille
lafilles
ungarçon
ungarçons
unfille
unfilles
legarçon
legarçons
lefille
lefilles
desgarçon
desgarçons
desfille
desfilles
lesgarçon
lesgarçons
lesfille
lesfilles
While am supposed to have an output like this:
un garçon
le garçon
The problems I have are:
The agreement is not working out, am having sentences that does not respect the agreement
There is no space between the two words in the sentence.
What is that I can't see?
Lets solve the easy part of the question first.
Q2. There is no space between the two words in the sentence.
You're close when it comes to the printing =)
The problem lies in how you're using the str.join
function.
>>> list_of_str = ['a', 'b', 'c']
>>> ''.join(list_of_str)
'abc'
>>> ' '.join(list_of_str)
'a b c'
>>> '|'.join(list_of_str)
'a|b|c'
Q1. The agreement is not working out, am having sentences that does not respect the agreement
First warning sign
To produce feature structure grammar with agreement, there should be a rule that contains something like D[AGR=?a] N[AGR=?a]
on the right-hand-side (RHS), e.g.
NP -> D[AGR=?a] N[AGR=?a]
With that missing there's no real agreement rule in the grammar, see http://www.nltk.org/howto/featgram.html
Now comes the gotcha!
If we look at the nltk.parse.generate
code carefully, it's merely yielding all possible combinations of the terminals and it seems like it's not caring about the feature structures: https://github.com/nltk/nltk/blob/develop/nltk/parse/generate.py
(I think that's a bug not a feature so raising an issue to the NLTK repository would be good)
So if we do this, it'll print all combinations of possible terminals (without caring for the agreement):
from nltk import grammar, parse
from nltk.parse.generate import generate
# If person is always 3rd, we can skip the PERSON feature.
g = """
DP -> D[AGR=?a] N[AGR=?a]
N[AGR=[NUM='sg', GND='m']] -> 'garcon'
N[AGR=[NUM='sg', GND='f']] -> 'fille'
D[AGR=[NUM='sg', GND='m']] -> 'un'
D[AGR=[NUM='sg', GND='f']] -> 'une'
"""
grammar = grammar.FeatureGrammar.fromstring(g)
print(list(generate(grammar, n=30)))
[out]:
[['un', 'garcon'], ['un', 'fille'], ['une', 'garcon'], ['une', 'fille']]
But if we try to parse valid and invalid sentences, the agreement rule kicks in:
from nltk import grammar, parse
from nltk.parse.generate import generate
g = """
DP -> D[AGR=?a] N[AGR=?a]
N[AGR=[NUM='sg', GND='m']] -> 'garcon'
N[AGR=[NUM='sg', GND='f']] -> 'fille'
D[AGR=[NUM='sg', GND='m']] -> 'un'
D[AGR=[NUM='sg', GND='f']] -> 'une'
"""
grammar = grammar.FeatureGrammar.fromstring(g)
parser = parse.FeatureEarleyChartParser(grammar)
trees = parser.parse('une garcon'.split()) # Invalid sentence.
print ("Parses for 'une garcon':", list(trees))
trees = parser.parse('un garcon'.split()) # Valid sentence.
print ("Parses for 'un garcon':", list(trees))
[out]:
Parses for 'une garcon': []
Parses for 'un garcon': [Tree(DP[], [Tree(D[AGR=[GND='m', NUM='sg']], ['un']), Tree(N[AGR=[GND='m', NUM='sg']], ['garcon'])])]
To achieve the agreement rule at generation, one possible solution would be to parse each generated production and keep the parse-able ones, e.g.
from nltk import grammar, parse
from nltk.parse.generate import generate
g = """
DP -> D[AGR=?a] N[AGR=?a]
N[AGR=[NUM='sg', GND='m']] -> 'garcon'
N[AGR=[NUM='sg', GND='f']] -> 'fille'
D[AGR=[NUM='sg', GND='m']] -> 'un'
D[AGR=[NUM='sg', GND='f']] -> 'une'
"""
grammar = grammar.FeatureGrammar.fromstring(g)
parser = parse.FeatureEarleyChartParser(grammar)
for tokens in list(generate(grammar, n=30)):
parsed_tokens = parser.parse(tokens)
try:
first_parse = next(parsed_tokens) # Check if there's a valid parse.
print(' '.join(first_parse.leaves()))
except StopIteration:
continue
[out]:
un garcon
une fille
I guess goal is to produce the last 2nd column of:
Without the prepositions:
from nltk import grammar, parse
from nltk.parse.generate import generate
g = """
DP -> D[AGR=?a] N[AGR=?a]
N[AGR=[NUM='sg', GND='m']] -> 'garcon'
N[AGR=[NUM='sg', GND='f']] -> 'fille'
N[AGR=[NUM='pl', GND='m']] -> 'garcons'
N[AGR=[NUM='pl', GND='f']] -> 'filles'
D[AGR=[NUM='sg', GND='m']] -> 'un'
D[AGR=[NUM='sg', GND='f']] -> 'une'
D[AGR=[NUM='sg', GND='m']] -> 'le'
D[AGR=[NUM='sg', GND='f']] -> 'la'
D[AGR=[NUM='pl', GND='m']] -> 'les'
D[AGR=[NUM='pl', GND='f']] -> 'les'
"""
grammar = grammar.FeatureGrammar.fromstring(g)
parser = parse.FeatureEarleyChartParser(grammar)
valid_productions = set()
for tokens in list(generate(grammar, n=30)):
parsed_tokens = parser.parse(tokens)
try:
first_parse = next(parsed_tokens) # Check if there's a valid parse.
valid_productions.add(' '.join(first_parse.leaves()))
except StopIteration:
continue
for np in sorted(valid_productions):
print(np)
[out]:
la fille
le garcon
les filles
les garcons
un garcon
une fille
Now to include the preposition
The TOP (aka START) of the grammar has to have more than one branch, currently the DP -> D[AGR=?a] N[AGR=?a]
rule is at the TOP, to allow for a PP
construction, we've to something like PHRASE -> DP | PP
and make the PHRASE
non-terminal the new TOP, e.g.
from nltk import grammar, parse
from nltk.parse.generate import generate
g = """
PHRASE -> DP | PP
DP -> D[AGR=?a] N[AGR=?a]
PP -> P[AGR=?a] N[AGR=?a]
P[AGR=[NUM='sg', GND='m']] -> 'du' | 'au'
N[AGR=[NUM='sg', GND='m']] -> 'garcon'
N[AGR=[NUM='sg', GND='f']] -> 'fille'
N[AGR=[NUM='pl', GND='m']] -> 'garcons'
N[AGR=[NUM='pl', GND='f']] -> 'filles'
D[AGR=[NUM='sg', GND='m']] -> 'un'
D[AGR=[NUM='sg', GND='f']] -> 'une'
D[AGR=[NUM='sg', GND='m']] -> 'le'
D[AGR=[NUM='sg', GND='f']] -> 'la'
D[AGR=[NUM='pl', GND='m']] -> 'les'
D[AGR=[NUM='pl', GND='f']] -> 'les'
"""
french_grammar = grammar.FeatureGrammar.fromstring(g)
parser = parse.FeatureEarleyChartParser(french_grammar)
valid_productions = set()
for tokens in list(generate(french_grammar, n=100)):
parsed_tokens = parser.parse(tokens)
try:
first_parse = next(parsed_tokens) # Check if there's a valid parse.
valid_productions.add(' '.join(first_parse.leaves()))
except StopIteration:
continue
for np in sorted(valid_productions):
print(np)
[out]:
au garcon
du garcon
la fille
le garcon
les filles
les garcons
un garcon
une fille
To get everything in the table:
from nltk import grammar, parse
from nltk.parse.generate import generate
g = """
PHRASE -> DP | PP
DP -> D[AGR=?a] N[AGR=?a]
PP -> P[AGR=[GND='m', NUM='sg']] N[AGR=[GND='m', NUM='sg']]
PP -> P[AGR=[GND='f', NUM='sg']] D[AGR=[GND='f', NUM='sg', DEF='d']] N[AGR=[GND='f', NUM='sg']]
PP -> P[AGR=[GND=?a, NUM='pl']] N[AGR=[GND=?a, NUM='pl']]
P[AGR=[NUM='sg', GND='m']] -> 'du' | 'au'
P[AGR=[NUM='sg', GND='f']] -> 'de' | 'à'
P[AGR=[NUM='pl']] -> 'des' | 'aux'
N[AGR=[NUM='sg', GND='m']] -> 'garcon'
N[AGR=[NUM='sg', GND='f']] -> 'fille'
N[AGR=[NUM='pl', GND='m']] -> 'garcons'
N[AGR=[NUM='pl', GND='f']] -> 'filles'
D[AGR=[NUM='sg', GND='m', DEF='i']] -> 'un'
D[AGR=[NUM='sg', GND='f', DEF='i']] -> 'une'
D[AGR=[NUM='sg', GND='m', DEF='d']] -> 'le'
D[AGR=[NUM='sg', GND='f', DEF='d']] -> 'la'
D[AGR=[NUM='pl', GND='m']] -> 'les'
D[AGR=[NUM='pl', GND='f']] -> 'les'
"""
french_grammar = grammar.FeatureGrammar.fromstring(g)
parser = parse.FeatureEarleyChartParser(french_grammar)
valid_productions = set()
for tokens in list(generate(french_grammar, n=100000)):
parsed_tokens = parser.parse(tokens)
try:
first_parse = next(parsed_tokens) # Check if there's a valid parse.
valid_productions.add(' '.join(first_parse.leaves()))
except StopIteration:
continue
for np in sorted(valid_productions):
print(np)
[out]:
au garcon
aux filles
aux garcons
de la fille
des filles
des garcons
du garcon
la fille
le garcon
les filles
les garcons
un garcon
une fille
à la fille
Beyond the table
It's also possible to produce de|a un(e) garcon|fille
, i.e.
- de un garcon
- de une fille
- a un garcon
- a une fille
But I'm not sure whether they're valid French phrases, but if they are you can underspecify the feminin singular PP rule and remove the DEF
feature:
PP -> P[AGR=[GND='f', NUM='sg']] D[AGR=[GND='f', NUM='sg', DEF='d']] N[AGR=[GND='f', NUM='sg']]
to:
PP -> P[AGR=[GND='f', NUM='sg']] D[AGR=[GND='f', NUM='sg']] N[AGR=[GND='f', NUM='sg']]
and then add an additional rule to produce male singular indefinite PP
PP -> P[AGR=[GND='f', NUM='sg']] D[AGR=[GND='m', NUM='sg', DEF='i']] N[AGR=[GND='m', NUM='sg']]
TL;DR
from nltk import grammar, parse
from nltk.parse.generate import generate
g = """
PHRASE -> DP | PP
DP -> D[AGR=?a] N[AGR=?a]
PP -> P[AGR=[GND='m', NUM='sg']] N[AGR=[GND='m', NUM='sg']]
PP -> P[AGR=[GND='f', NUM='sg']] D[AGR=[GND='f', NUM='sg']] N[AGR=[GND='f', NUM='sg']]
PP -> P[AGR=[GND='f', NUM='sg']] D[AGR=[GND='m', NUM='sg', DEF='i']] N[AGR=[GND='m', NUM='sg']]
PP -> P[AGR=[GND=?a, NUM='pl']] N[AGR=[GND=?a, NUM='pl']]
P[AGR=[NUM='sg', GND='m']] -> 'du' | 'au'
P[AGR=[NUM='sg', GND='f']] -> 'de' | 'à'
P[AGR=[NUM='pl']] -> 'des' | 'aux'
N[AGR=[NUM='sg', GND='m']] -> 'garcon'
N[AGR=[NUM='sg', GND='f']] -> 'fille'
N[AGR=[NUM='pl', GND='m']] -> 'garcons'
N[AGR=[NUM='pl', GND='f']] -> 'filles'
D[AGR=[NUM='sg', GND='m', DEF='i']] -> 'un'
D[AGR=[NUM='sg', GND='f', DEF='i']] -> 'une'
D[AGR=[NUM='sg', GND='m', DEF='d']] -> 'le'
D[AGR=[NUM='sg', GND='f', DEF='d']] -> 'la'
D[AGR=[NUM='pl', GND='m']] -> 'les'
D[AGR=[NUM='pl', GND='f']] -> 'les'
"""
french_grammar = grammar.FeatureGrammar.fromstring(g)
parser = parse.FeatureEarleyChartParser(french_grammar)
valid_productions = set()
for tokens in list(generate(french_grammar, n=100000)):
parsed_tokens = parser.parse(tokens)
try:
first_parse = next(parsed_tokens) # Check if there's a valid parse.
valid_productions.add(' '.join(first_parse.leaves()))
except StopIteration:
continue
for np in sorted(valid_productions):
print(np)
[out]:
au garcon
aux filles
aux garcons
de la fille
de un garcon
de une fille
des filles
des garcons
du garcon
la fille
le garcon
les filles
les garcons
un garcon
une fille
à la fille
à un garcon
à une fille