| 204 | print "pattern.es.parser.find_lemmata()" |
| 205 | |
def test_parse(self):
    # Checks es.parser.parse() in two steps: an exact comparison of the output
    # for one known sentence, then a tagging-accuracy threshold measured over
    # the tagged corpus file shipped under PATH/corpora.
    #
    # Assert parsed output with Penn Treebank II tags (slash-formatted).
    # "el gato negro" is a noun phrase, "en la alfombra" is a prepositional noun phrase.
    v = es.parser.parse(u"El gato negro se sentó en la alfombra.")
    self.assertEqual(v, # XXX - shouldn't "se" be part of the verb phrase?
        u"El/DT/B-NP/O gato/NN/I-NP/O negro/JJ/I-NP/O "
        u"se/PRP/B-NP/O sentó/VB/B-VP/O "
        u"en/IN/B-PP/B-PNP la/DT/B-NP/I-PNP alfombra/NN/I-NP/I-PNP ././O/O"
    )
    # Assert the accuracy of the Spanish tagger.
    # Each corpus line is read as one sentence of space-separated "word/tag"
    # tokens; the words are re-tagged with the PAROLE tagset and the second
    # slash-separated field of every output token is compared to the corpus tag.
    correct, total = 0, 0
    corpus_path = os.path.join(PATH, "corpora", "tagged-es-wikicorpus.txt")
    f = open(corpus_path)
    # try/finally (rather than relying on garbage collection) guarantees the
    # corpus file is closed even when parsing or an assertion raises.
    try:
        # Iterate the file lazily instead of loading every line with readlines().
        for line in f:
            sentence = line.decode("utf-8").strip()
            tagged = [token.split("/") for token in sentence.split(" ")]
            # es.parse() is handed a list holding one pre-tokenized sentence.
            words = [[word for word, pos in tagged]]
            parsed = es.parse(words, tokenize=False, tagset=es.PAROLE)
            parsed = [token.split("/") for token in parsed.split(" ")]
            # Positional lookup is kept on purpose: if the parser returns fewer
            # tokens than the corpus sentence, this raises instead of silently
            # comparing a truncated sentence.
            for j, (word, pos) in enumerate(tagged):
                if pos == parsed[j][1]:
                    correct += 1
                total += 1
    finally:
        f.close()
    # An empty corpus should fail the test clearly, not raise ZeroDivisionError.
    self.assertTrue(total > 0)
    accuracy = float(correct) / total
    # Parenthesised single-argument print emits the same text under the
    # Python 2 print statement.
    print(accuracy)
    self.assertTrue(accuracy > 0.92)
    print("pattern.es.parser.parse()")
| 230 | |
| 231 | def test_tag(self): |
| 232 | # Assert [("el", "DT"), ("gato", "NN"), ("negro", "JJ")]. |