From 056e173484cf2819652ed51a77334aad8186293e Mon Sep 17 00:00:00 2001 From: timv Date: Mon, 10 Jun 2013 15:21:41 -0400 Subject: [PATCH] Progress on Penn Treebank benchmarks/experiments. --- examples/ptb.dyna | 139 +++++++++++++++++++++++++++++++ src/Dyna/Backend/Python/utils.py | 64 +++++++++++++- 2 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 examples/ptb.dyna diff --git a/examples/ptb.dyna b/examples/ptb.dyna new file mode 100644 index 0000000..b4877f5 --- /dev/null +++ b/examples/ptb.dyna @@ -0,0 +1,139 @@ +sentence(0) := + &t("S", &t("NP-SBJ", &t("NNP", "Rolls-Royce"), + &t("@NP-SBJ", &t("NNP", "Motor"), + &t("@@NP-SBJ", &t("NNPS", "Cars"), + &t("NNP", "Inc.")))), + &t("@S", &t("VP", &t("VBD", "said"), + &t("SBAR", &t("-NONE-", "0"), + &t("S", &t("NP-SBJ", &t("PRP", "it")), + &t("VP", &t("VBZ", "expects"), + &t("S", &t("NP-SBJ", &t("PRP$", "its"), + &t("@NP-SBJ", &t("NNP", "U.S."), + &t("NNS", "sales"))), + &t("VP", &t("TO", "to"), + &t("VP", &t("VB", "remain"), + &t("@VP", &t("ADJP-PRD", &t("JJ", "steady")), + &t("@@VP", &t("PP-LOC-CLR", &t("IN", "at"), + &t("NP", &t("QP", &t("IN", "about"), + &t("CD", "1,200")), + &t("NNS", "cars"))), + &t("PP-TMP", &t("IN", "in"), + &t("NP", &t("CD", "1990")))))))))))), + &t(".", "."))). + +sentence(1) := + &t("S", &t("NP-SBJ", &t("DT", "The"), + &t("@NP-SBJ", &t("NN", "luxury"), + &t("@@NP-SBJ", &t("NN", "auto"), + &t("NN", "maker")))), + &t("@S", &t("NP-TMP", &t("JJ", "last"), + &t("NN", "year")), + &t("VP", &t("VBD", "sold"), + &t("@VP", &t("NP", &t("CD", "1,214"), + &t("NNS", "cars")), + &t("PP-LOC", &t("IN", "in"), + &t("NP", &t("DT", "the"), + &t("NNP", "U.S."))))))). + +sentence(2) := + &t("S", &t("NP-SBJ", &t("NP", &t("NNP", "Howard"), + &t("NNP", "Mosher")), + &t("@NP-SBJ", &t(",", ","), + &t("@@NP-SBJ", &t("NP", &t("NP", &t("NN", "president")), + &t("@NP", &t("CC", "and"), + &t("NP", &t("JJ", "chief"), + &t("@NP", &t("NN", "executive"), + &t("NN", "officer"))))), + &t(",", ",")))), + &t("@S", &t("VP", &t("VBD", "said"), + &t("SBAR", &t("-NONE-", "0"), + &t("S", &t("NP-SBJ", &t("PRP", "he")), + &t("VP", &t("VBZ", "anticipates"), + &t("NP", &t("NP", &t("NN", "growth")), + &t("@NP", &t("PP", &t("IN", "for"), + &t("NP", &t("DT", "the"), + &t("@NP", &t("NN", "luxury"), + &t("@@NP", &t("NN", "auto"), + &t("NN", "maker"))))), + &t("PP-LOC", &t("PP", &t("IN", "in"), + &t("NP", &t("NNP", "Britain"), + &t("@NP", &t("CC", "and"), + &t("NNP", "Europe")))), + &t("@PP-LOC", &t(",", ","), + &t("@@PP-LOC", &t("CC", "and"), + &t("PP", &t("IN", "in"), + &t("NP", &t("ADJP", &t("JJ", "Far"), + &t("JJ", "Eastern")), + &t("NNS", "markets")))))))))))), + &t(".", "."))). + +%toString(A) := +% needs(A), +% A. +% +%toString(&t(A)) := +% needs(&t(A)), +% A. +% +%toString(&t(A,B)) := +% needs(&t(A,B)), +% mod("(%s %s)", +% tuple(toString(A), +% toString(B))). +% +%toString(&t(A,B,C)) := +% needs(&t(A,B,C)), +% mod("(%s %s %s)", +% tuple(toString(A), +% toString(B), +% toString(C))). + +needs(A) |= needs(&t(A,B)). +needs(B) |= needs(&t(A,B)). + +needs(A) |= needs(&t(A,B,C)). +needs(B) |= needs(&t(A,B,C)). +needs(C) |= needs(&t(A,B,C)). + +%needs(sentence(0)). +%needs(sentence(1)). +%needs(sentence(2)). + +%zzz(0) := toString(sentence(0)). +%zzz(1) := toString(sentence(1)). +%zzz(2) := toString(sentence(2)). + +%sym(&t(X,_)) := X. +%sym(&t(X,_,_)) := X. + +% the following rule order is important bc we use :=. +sym(A) := needs(A), A. +sym(&t(A,B)) := needs(&t(A,B)), A. +sym(&t(A,B,C)) := needs(&t(A,B,C)), A. + +% rule used to create top-level subtree +rules(&t(X,Y)) := needs(&t(X,Y)), &r(X, sym(Y)). +rules(&t(X,Y,Z)) := needs(&t(X,Y,Z)), needs(Z), &r(X, sym(Y), sym(Z)). + +% unnormalized +c(X,Y) += r(X,Y) is rules(&t(X1,Y1)), 1. +c(X,Y,Z) += r(X,Y,Z) is rules(&t(X1,Y1,Z1)), 1. + +% normalizing constants +n(X) += c(X,Y). +n(X) += c(X,Y,Z). + +% normalize +p(X,Y) := c(X,Y) / n(X). +p(X,Y,Z) := c(X,Y,Z) / n(X). + +% activate fake backchaining. +needs(sentence(0)). +needs(sentence(1)). +needs(sentence(2)). + +% TODO: convert tree structure into word(word, position, sentence). + +% TODO: augment CKY rules with sentence index + +% TODO: implement constituent recall accuracy measure for two trees diff --git a/src/Dyna/Backend/Python/utils.py b/src/Dyna/Backend/Python/utils.py index fcaadd2..fed7af0 100644 --- a/src/Dyna/Backend/Python/utils.py +++ b/src/Dyna/Backend/Python/utils.py @@ -152,10 +152,68 @@ if __name__ == '__main__': def t(xs): if isinstance(xs, basestring): - return '"%s"' % xs +# return '"%s"' % xs + return xs else: - return '&t(%s)' % ','.join(t(x) for x in xs) + assert len(xs) > 1 + if len(xs) == 2: + [sym, a] = map(t, xs) +# return '&t(%s)' % ', '.join(t(x) for x in xs) + return [sym, a] + elif len(xs) == 3: + [sym, a, b] = map(t, xs) +# return '&t(%s, %s, %s)' % (sym, a, b) + return [sym, a, b] + else: + [sym, a] = t(xs[0]), t(xs[1]) + rest = t(['@' + xs[0]] + xs[2:]) +# return '&t(%s, %s, %s)' % (sym, a, rest) + return [sym, a, rest] + + + def check_binary(x): + if isinstance(x, basestring): + return True + elif len(x) in (2, 3): + return all(map(check_binary, x)) + else: + return False + + from cStringIO import StringIO + + def pretty(t, initialindent=0): + "Pretty print tree as a tabbified s-expression." + f = StringIO() + out = f.write + def pp(t, indent=initialindent, indentme=True): + if indentme: + out(' '*indent) + if isinstance(t, basestring): # base case + return out('"%s"' % t) + if len(t) == 1: + if t[0]: + pp('"%s"' % t[0], indent, indentme) + return + label, children = t[0], t[1:] + + label = '"%s"' % label + + assert isinstance(label, basestring) + out('&t(%s, ' % label) + n = len(children) + for i, child in enumerate(children): + pp(child, indent + len(label) + 5, i != 0) # first child already indented + if i != n-1: # no newline after last child + out(',\n') + out(')') + pp(t) + out('\n') + return f.getvalue() for i, [x] in enumerate(parse_sexpr(sys.stdin.read())): - print 'sentence(%s) := %s.' % (i, t(x)) + btree = t(x) + + assert check_binary(btree) + print + print 'sentence(%s) :=\n%s.' % (i, pretty(btree, 4).rstrip()) print -- 2.50.1