hydra-www.ietfng.org Git - dyna2/commitdiff
Progress on Penn Treebank benchmarks/experiments.
author timv <tim.f.vieira@gmail.com>
Mon, 10 Jun 2013 19:21:41 +0000 (15:21 -0400)
committer timv <tim.f.vieira@gmail.com>
Mon, 10 Jun 2013 19:21:41 +0000 (15:21 -0400)
examples/ptb.dyna [new file with mode: 0644]
src/Dyna/Backend/Python/utils.py

diff --git a/examples/ptb.dyna b/examples/ptb.dyna
new file mode 100644 (file)
index 0000000..b4877f5
--- /dev/null
@@ -0,0 +1,139 @@
+sentence(0) :=
+    &t("S", &t("NP-SBJ", &t("NNP", "Rolls-Royce"),
+                         &t("@NP-SBJ", &t("NNP", "Motor"),
+                                       &t("@@NP-SBJ", &t("NNPS", "Cars"),
+                                                      &t("NNP", "Inc.")))),
+            &t("@S", &t("VP", &t("VBD", "said"),
+                              &t("SBAR", &t("-NONE-", "0"),
+                                         &t("S", &t("NP-SBJ", &t("PRP", "it")),
+                                                 &t("VP", &t("VBZ", "expects"),
+                                                          &t("S", &t("NP-SBJ", &t("PRP$", "its"),
+                                                                               &t("@NP-SBJ", &t("NNP", "U.S."),
+                                                                                             &t("NNS", "sales"))),
+                                                                  &t("VP", &t("TO", "to"),
+                                                                           &t("VP", &t("VB", "remain"),
+                                                                                    &t("@VP", &t("ADJP-PRD", &t("JJ", "steady")),
+                                                                                              &t("@@VP", &t("PP-LOC-CLR", &t("IN", "at"),
+                                                                                                                          &t("NP", &t("QP", &t("IN", "about"),
+                                                                                                                                            &t("CD", "1,200")),
+                                                                                                                                   &t("NNS", "cars"))),
+                                                                                                         &t("PP-TMP", &t("IN", "in"),
+                                                                                                                      &t("NP", &t("CD", "1990")))))))))))),
+                     &t(".", "."))).
+
+sentence(1) :=
+    &t("S", &t("NP-SBJ", &t("DT", "The"),
+                         &t("@NP-SBJ", &t("NN", "luxury"),
+                                       &t("@@NP-SBJ", &t("NN", "auto"),
+                                                      &t("NN", "maker")))),
+            &t("@S", &t("NP-TMP", &t("JJ", "last"),
+                                  &t("NN", "year")),
+                     &t("VP", &t("VBD", "sold"),
+                              &t("@VP", &t("NP", &t("CD", "1,214"),
+                                                 &t("NNS", "cars")),
+                                        &t("PP-LOC", &t("IN", "in"),
+                                                     &t("NP", &t("DT", "the"),
+                                                              &t("NNP", "U.S."))))))).
+
+sentence(2) :=
+    &t("S", &t("NP-SBJ", &t("NP", &t("NNP", "Howard"),
+                                  &t("NNP", "Mosher")),
+                         &t("@NP-SBJ", &t(",", ","),
+                                       &t("@@NP-SBJ", &t("NP", &t("NP", &t("NN", "president")),
+                                                               &t("@NP", &t("CC", "and"),
+                                                                         &t("NP", &t("JJ", "chief"),
+                                                                                  &t("@NP", &t("NN", "executive"),
+                                                                                            &t("NN", "officer"))))),
+                                                      &t(",", ",")))),
+            &t("@S", &t("VP", &t("VBD", "said"),
+                              &t("SBAR", &t("-NONE-", "0"),
+                                         &t("S", &t("NP-SBJ", &t("PRP", "he")),
+                                                 &t("VP", &t("VBZ", "anticipates"),
+                                                          &t("NP", &t("NP", &t("NN", "growth")),
+                                                                   &t("@NP", &t("PP", &t("IN", "for"),
+                                                                                      &t("NP", &t("DT", "the"),
+                                                                                               &t("@NP", &t("NN", "luxury"),
+                                                                                                         &t("@@NP", &t("NN", "auto"),
+                                                                                                                    &t("NN", "maker"))))),
+                                                                             &t("PP-LOC", &t("PP", &t("IN", "in"),
+                                                                                                   &t("NP", &t("NNP", "Britain"),
+                                                                                                            &t("@NP", &t("CC", "and"),
+                                                                                                                      &t("NNP", "Europe")))),
+                                                                                          &t("@PP-LOC", &t(",", ","),
+                                                                                                        &t("@@PP-LOC", &t("CC", "and"),
+                                                                                                                       &t("PP", &t("IN", "in"),
+                                                                                                                                &t("NP", &t("ADJP", &t("JJ", "Far"),
+                                                                                                                                                    &t("JJ", "Eastern")),
+                                                                                                                                         &t("NNS", "markets")))))))))))),
+                     &t(".", "."))).
+
+%toString(A) :=
+%  needs(A),
+%  A.
+%
+%toString(&t(A)) :=
+%  needs(&t(A)),
+%  A.
+%
+%toString(&t(A,B)) :=
+%  needs(&t(A,B)),
+%  mod("(%s %s)",
+%      tuple(toString(A),
+%            toString(B))).
+%
+%toString(&t(A,B,C)) :=
+%  needs(&t(A,B,C)),
+%  mod("(%s %s %s)",
+%      tuple(toString(A),
+%            toString(B),
+%            toString(C))).
+
+needs(A) |= needs(&t(A,B)).
+needs(B) |= needs(&t(A,B)).
+
+needs(A) |= needs(&t(A,B,C)).
+needs(B) |= needs(&t(A,B,C)).
+needs(C) |= needs(&t(A,B,C)).
+
+%needs(sentence(0)).
+%needs(sentence(1)).
+%needs(sentence(2)).
+
+%zzz(0) := toString(sentence(0)).
+%zzz(1) := toString(sentence(1)).
+%zzz(2) := toString(sentence(2)).
+
+%sym(&t(X,_)) := X.
+%sym(&t(X,_,_)) := X.
+
+% The order of the following rules is important because they are combined with :=.
+sym(A) := needs(A), A.
+sym(&t(A,B)) := needs(&t(A,B)), A.
+sym(&t(A,B,C)) := needs(&t(A,B,C)), A.
+
+% The grammar rule used at the top level of each needed subtree.
+rules(&t(X,Y)) := needs(&t(X,Y)), &r(X, sym(Y)).
+rules(&t(X,Y,Z)) := needs(&t(X,Y,Z)), needs(Z), &r(X, sym(Y), sym(Z)).
+
+% unnormalized rule counts
+c(X,Y) += r(X,Y) is rules(&t(X1,Y1)), 1.
+c(X,Y,Z) += r(X,Y,Z) is rules(&t(X1,Y1,Z1)), 1.
+
+% normalizing constants
+n(X) += c(X,Y).
+n(X) += c(X,Y,Z).
+
+% normalize
+p(X,Y) := c(X,Y) / n(X).
+p(X,Y,Z) := c(X,Y,Z) / n(X).
+
+% Activate the fake backchaining by marking each sentence as needed.
+needs(sentence(0)).
+needs(sentence(1)).
+needs(sentence(2)).
+
+% TODO: convert tree structure into word(word, position, sentence).
+
+% TODO: augment CKY rules with sentence index
+
+% TODO: implement constituent recall accuracy measure for two trees
index fcaadd22f90eb0719000fce01b8dd76321df7692..fed7af0a80fa3595290479815748cc83e71d1cbc 100644 (file)
@@ -152,10 +152,68 @@ if __name__ == '__main__':
 
     def t(xs):
         if isinstance(xs, basestring):
-            return '"%s"' % xs
+#            return '"%s"' % xs
+            return xs
         else:
-            return '&t(%s)' % ','.join(t(x) for x in xs)
+            assert len(xs) > 1
+            if len(xs) == 2:
+                [sym, a] = map(t, xs)
+#                return '&t(%s)' % ', '.join(t(x) for x in xs)
+                return [sym, a]
+            elif len(xs) == 3:
+                [sym, a, b] = map(t, xs)
+#                return '&t(%s, %s, %s)' % (sym, a, b)
+                return [sym, a, b]
+            else:
+                [sym, a] = t(xs[0]), t(xs[1])
+                rest = t(['@' + xs[0]] + xs[2:])
+#                return '&t(%s, %s, %s)' % (sym, a, rest)
+                return [sym, a, rest]
+
+
+    def check_binary(x):
+        if isinstance(x, basestring):
+            return True
+        elif len(x) in (2, 3):
+            return all(map(check_binary, x))
+        else:
+            return False
+
+    from cStringIO import StringIO
+
+    def pretty(t, initialindent=0):
+        "Pretty print tree as a tabbified s-expression."
+        f = StringIO()
+        out = f.write
+        def pp(t, indent=initialindent, indentme=True):
+            if indentme:
+                out(' '*indent)
+            if isinstance(t, basestring):                    # base case
+                return out('"%s"' % t)
+            if len(t) == 1:
+                if t[0]:
+                    pp('"%s"' % t[0], indent, indentme)
+                return
+            label, children = t[0], t[1:]
+
+            label = '"%s"' % label
+
+            assert isinstance(label, basestring)
+            out('&t(%s, ' % label)
+            n = len(children)
+            for i, child in enumerate(children):
+                pp(child, indent + len(label) + 5, i != 0)   # first child already indented
+                if i != n-1:                                 # no newline after last child
+                    out(',\n')
+            out(')')
+        pp(t)
+        out('\n')
+        return f.getvalue()
 
     for i, [x] in enumerate(parse_sexpr(sys.stdin.read())):
-        print 'sentence(%s) := %s.' % (i, t(x))
+        btree = t(x)
+
+        assert check_binary(btree)
+        print 
+        print 'sentence(%s) :=\n%s.' % (i, pretty(btree, 4).rstrip())
         print