Source code for nltk.tabdata
# Natural Language Toolkit: Encode/Decode Data as Tab-files
#
# Copyright (C) 2024 NLTK Project
# Author: Eric Kafe <kafe.eric@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
[docs]
def rm_nl(s):
    """Return *s* without its final newline character, if it has one.

    Only a single trailing ``"\\n"`` is removed; any other content,
    including earlier newlines, is left untouched.  An empty string is
    returned unchanged.

    :param s: the string to strip
    :return: *s* minus one trailing newline, or *s* itself
    """
    # ``endswith`` is safe on the empty string, where indexing ``s[-1]``
    # would raise IndexError.
    if s.endswith("\n"):
        return s[:-1]
    return s
[docs]
class TabEncoder:
    """Serialize simple containers of strings as newline/tab-delimited text."""

    def list2txt(self, s):
        """Return the strings in *s* joined by newline characters."""
        newline = "\n"
        return newline.join(s)

    def set2txt(self, s):
        """Return the elements of *s* as newline-separated text.

        The elements are copied into a list and passed to ``list2txt``.
        """
        members = list(s)
        return self.list2txt(members)

    def tup2tab(self, tup):
        """Return the string fields of *tup* joined by tab characters."""
        tab = "\t"
        return tab.join(tup)

    def tups2tab(self, x):
        """Return one tab-separated line per tuple in *x*, joined by newlines."""
        return "\n".join(map(self.tup2tab, x))

    def dict2tab(self, d):
        """Return the ``(key, value)`` items of *d* as tab-separated lines."""
        pairs = d.items()
        return self.tups2tab(pairs)

    def ivdict2tab(self, d):
        """Return *d* as tab-separated lines, converting each value with ``str``.

        Meant for dictionaries whose values are integers.
        """
        rows = [(key, str(value)) for key, value in d.items()]
        return self.tups2tab(rows)
[docs]
class TabDecoder:
    """Parse newline/tab-delimited text back into simple containers.

    Methods taking *f* iterate over it line by line and pass each line
    through ``rm_nl`` before further processing.
    """

    def txt2list(self, f):
        """Return the lines of *f* as a list, each passed through ``rm_nl``."""
        return list(map(rm_nl, f))

    def txt2set(self, f):
        """Return the lines of *f* as a set, each passed through ``rm_nl``."""
        return set(map(rm_nl, f))

    def tab2tup(self, s):
        """Split *s* on tab characters and return the fields as a tuple."""
        fields = s.split("\t")
        return tuple(fields)

    def tab2tups(self, f):
        """Return a list with one tuple of tab-separated fields per line of *f*."""
        stripped_lines = map(rm_nl, f)
        return [self.tab2tup(line) for line in stripped_lines]

    def tab2dict(self, f):
        """Return a dict built from the two-field lines of *f*.

        The first field of each line is the key and the second the value.
        """
        mapping = {}
        for key, value in self.tab2tups(f):
            mapping[key] = value
        return mapping

    def tab2ivdict(self, f):
        """Return a dict built from the two-field lines of *f*, with ``int`` values.

        The first field of each line is the key; the second is converted
        with ``int`` to give the value.
        """
        mapping = {}
        for key, value in self.tab2tups(f):
            mapping[key] = int(value)
        return mapping
# ---------------------------------------------------------------------------
# Maxent data
# ---------------------------------------------------------------------------
[docs]
class MaxentEncoder(TabEncoder):
    """Tab-file encoder for dictionaries keyed by 3-tuples."""

    def tupdict2tab(self, d):
        """Return *d* as tab-separated lines of four fields each.

        Every item ``((a, b, c), value)`` of *d* yields one line holding
        ``a``, an encoded form of ``b``, ``c`` and ``repr(value)``.
        """

        def encode_field(name, field):
            # The field that goes with the "wordlen" name is written via repr().
            if name == "wordlen":
                return repr(field)
            # True, False and None are written as "repr-True", "repr-False"
            # and "repr-None".
            # NOTE(review): ``in`` compares by equality, so 1 and 0 also take
            # this branch -- confirm that is intended.
            if field in [True, False, None]:
                return f"repr-{field}"
            return field

        rows = [
            (name, encode_field(name, field), third, repr(value))
            for (name, field, third), value in d.items()
        ]
        return self.tups2tab(rows)
[docs]
class MaxentDecoder(TabDecoder):
    """Tab-file decoder producing dictionaries keyed by 3-tuples."""

    def tupkey2dict(self, f):
        """Return a dict built from the four-field lines of *f*.

        For each line ``a, b, c, d`` the key is ``(a, decoded b, c)`` and
        the value is ``int(d)``.
        """
        # Textual markers and the Python singletons they stand for.
        markers = {"repr-None": None, "repr-True": True, "repr-False": False}

        def decode_field(name, field):
            # The field that goes with the "wordlen" name is read as an integer.
            if name == "wordlen":
                return int(field)
            if field in markers:
                return markers[field]
            return field

        decoded = {}
        for name, field, third, value in self.tab2tups(f):
            # Build the key before converting the value, matching the
            # evaluation order of a dict comprehension.
            key = (name, decode_field(name, field), third)
            decoded[key] = int(value)
        return decoded
# ---------------------------------------------------------------------------
# Punkt data
# ---------------------------------------------------------------------------
[docs]
class PunktDecoder(TabDecoder):
    """Tab-file decoder returning ``defaultdict`` objects with ``int`` default."""

    def tab2intdict(self, f):
        """Return a ``defaultdict(int)`` built from the two-field lines of *f*.

        The first field of each line is the key; the second is converted
        with ``int`` to give the value.
        """
        from collections import defaultdict

        counts = defaultdict(int)
        for key, value in self.tab2tups(f):
            counts[key] = int(value)
        return counts