Quirin Würschinger, LMU Munich
Documentation: https://wuqui.github.io/bncparse/
Please visit the above website, as GitHub cannot render everything properly in the version below.
The diagram below illustrates all of the data that is currently available. Variables that have been added to those available in the downloadable version of the BNC are marked with a + prefix.
Package requirements are stored in requirements.yml.
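The notebook cells below rely on a few standard imports that are not shown explicitly in this section; a minimal sketch of what they likely are (all of these names are used in the code that follows):

import os
from collections import defaultdict
from pathlib import Path

import pandas as pd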
For development, I use a small subset of the corpus, stored in data/test, that contains only the first 10 texts.
testing = True

if testing:
    path_bnc = Path('../data/test/bnc-2014-spoken')
    assert path_bnc.exists()
    texts_n = 10
    tokens_n = 94_659
else:
    path_bnc = Path('../data/bnc-2014-spoken')
    assert path_bnc.exists()
    texts_n = 1251
    tokens_n = 11_422_615
path_corpus = Path(path_bnc / 'spoken' / 'tagged')
path_corpus_untagged = Path(path_bnc / 'spoken' / 'untagged')
path_metadata = Path(path_bnc / 'spoken' / 'metadata')
fp_meta_speakers = Path('../data/bnc-2014-spoken/spoken/metadata/bnc2014spoken-speakerdata.tsv')
fp_meta_speakers_fields = Path('../data/bnc-2014-spoken/spoken/metadata/metadata-fields-speaker.txt')
fp_meta_texts = Path('../data/bnc-2014-spoken/spoken/metadata/bnc2014spoken-textdata.tsv')
fp_meta_texts_fields = Path('../data/bnc-2014-spoken/spoken/metadata/metadata-fields-text.txt')
assert path_corpus.exists()
assert path_corpus_untagged.exists()
assert path_metadata.exists()
assert fp_meta_speakers.exists()
assert fp_meta_speakers_fields.exists()
assert fp_meta_texts.exists()
assert fp_meta_texts_fields.exists()
path_texts = list(path_corpus.glob('*.xml'))
assert len(path_texts) == texts_n
get_xml(f_path)
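get_xml is not defined in this section; the following is a minimal sketch of what it presumably does, assuming the standard library's ElementTree (the actual implementation may differ, e.g. it could use lxml):

import xml.etree.ElementTree as ET

def get_xml(f_path):
    # parse an XML corpus file and return its root element,
    # so callers can use .get(), .iter() and .findall() as below
    tree = ET.parse(f_path)
    return tree.getroot()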
texts = [get_xml(path) for path in path_texts]
meta_texts_head = pd.read_csv(
    fp_meta_texts_fields,
    delimiter='\t',
    skiprows=1,
    index_col=0
)
meta_texts = pd.read_csv(
    fp_meta_texts,
    delimiter='\t',
    names=meta_texts_head['XML tag'],
    index_col=0
)
texts_tokens = []

for text in texts:
    text_d = {}
    text_d['text_id'] = text.get('id')
    text_d['text_toks_n'] = 0
    for tok in text.iter('w'):
        text_d['text_toks_n'] += 1
    texts_tokens.append(text_d)

texts_tokens = pd.DataFrame(texts_tokens)
texts_tokens
# reset index and call it text_id
meta_texts_merge = meta_texts.reset_index().rename(columns={'index': 'text_id'})

meta_texts = pd.merge(
    left=meta_texts_merge,
    right=texts_tokens,
    on='text_id'
)
meta_texts
if not testing:
    meta_texts.to_csv('../out/texts.csv', index=False)
utterances = []

for text in texts:
    for u in text.findall('u'):
        u_d = {}
        u_d['text_id'] = text.get('id')
        u_d['u_n'] = u.get('n')
        u_d['u_who'] = u.get('who')
        u_d['u_trans'] = u.get('trans')
        u_d['u_whoConfidence'] = u.get('whoConfidence')
        u_d['u_toks_n'] = len(list(u.iter('w')))
        utterances.append(u_d)

utterances = pd.DataFrame(utterances)
utterances
|   | text_id | u_n | u_who | u_trans | u_whoConfidence | u_toks_n |
|---|---------|-----|-------|---------|-----------------|----------|
| 0 | SN64 | 1 | S0590 | nonoverlap | high | 18 |
| 1 | SN64 | 2 | S0588 | nonoverlap | high | 0 |
| 2 | SN64 | 3 | S0590 | nonoverlap | high | 1 |
| 3 | SN64 | 4 | S0588 | nonoverlap | high | 9 |
| 4 | SN64 | 5 | S0589 | overlap | high | 7 |
| ... | ... | ... | ... | ... | ... | ... |
| 1248105 | SMHY | 261 | S0037 | overlap | high | 9 |
| 1248106 | SMHY | 262 | S0115 | nonoverlap | high | 2 |
| 1248107 | SMHY | 263 | S0037 | nonoverlap | high | 6 |
| 1248108 | SMHY | 264 | S0115 | nonoverlap | high | 29 |
| 1248109 | SMHY | 265 | S0037 | nonoverlap | high | 1 |

1248110 rows × 6 columns
if not testing:
    utterances.to_csv('../out/utterances.csv', index=False)
For this, I use the untagged version of the corpus in the directory spoken/untagged/.
path_texts_untag = list(path_corpus_untagged.glob('*.xml'))
texts_untag = [get_xml(fp) for fp in path_texts_untag]
Limit the selection to texts that have the word ‘request’ in the conv_type field of the text header.
texts_untag_requests = []

for text in texts_untag:
    header = text.find('header')
    if header is None:
        continue
    conv_type = header.find('conv_type')
    if conv_type is None or conv_type.text is None:
        continue
    if 'request' in conv_type.text:
        texts_untag_requests.append(text)
print(
    f'all texts: {len(texts_untag)}',
    f'texts with requests: {len(texts_untag_requests)}',
    sep='\n'
)
all texts: 1251
texts with requests: 154
utterances_requests = []

for text in texts_untag_requests:
    for u in text.iter('u'):
        u_d = {}
        u_d['text_id'] = text.get('id')
        u_d['u_n'] = u.get('n')
        u_d['u_who'] = u.get('who')
        u_d['text'] = u.text
        utterances_requests.append(u_d)

utterances_requests = pd.DataFrame(utterances_requests)
utterances_requests
|   | text_id | u_n | u_who | text |
|---|---------|-----|-------|------|
| 0 | SQ2W | 1 | S0439 | we have to like move out of this house |
| 1 | SQ2W | 2 | S0441 | no |
| 2 | SQ2W | 3 | S0439 | no no not move out of the house we have this v... |
| 3 | SQ2W | 4 | S0441 | None |
| 4 | SQ2W | 5 | S0439 | oh er are you here this weekend? |
| ... | ... | ... | ... | ... |
| 204613 | SJSC | 1166 | S0439 | mm |
| 204614 | SJSC | 1167 | S0440 | but the pension at the moment |
| 204615 | SJSC | 1168 | S0439 | we're not |
| 204616 | SJSC | 1169 | S0440 | None |
| 204617 | SJSC | 1170 | S0440 | ah bonjour Paris calling Paris calling |

204618 rows × 4 columns
Filter out utterances without text:

utterances_requests = utterances_requests[utterances_requests['text'].notna()]

Randomize the rows:

utterances_requests = utterances_requests.sample(frac=1).reset_index(drop=True)

Select the first 50,000 rows:

utterances_requests = utterances_requests.iloc[:50_000]

Write them out to out/utterances_requests_50k.csv:

if not testing:
    utterances_requests.to_csv(
        '../out/utterances_requests_50k.csv', index=False)
meta_speakers_head = pd.read_csv(
    fp_meta_speakers_fields,
    delimiter='\t',
    skiprows=1,
    index_col=0
)
meta_speakers = pd.read_csv(
    fp_meta_speakers,
    delimiter='\t',
    names=meta_speakers_head['XML tag'],
    index_col=0
)
meta_speakers
speakers_toks = defaultdict(int)

for text in texts:
    for u in text.iter('u'):
        who = u.get('who')
        n_words = len([w for w in u.iter('w')])
        speakers_toks[who] += n_words
speaker_toks = pd.DataFrame(list(speakers_toks.items()), columns=['who', 'speaker_toks_n'])
speaker_toks.sort_values(by='speaker_toks_n', ascending=False).head(10)
meta_speakers_merge = meta_speakers.reset_index().rename(columns={'index': 'who'})
meta_speakers = pd.merge(
    left=meta_speakers_merge,
    right=speaker_toks,
    on='who'
)
meta_speakers
if not testing:
    meta_speakers.to_csv('../out/speakers.csv', index=False)
In addition to the metadata present in the corpus, I’ve added the following columns:

- w_idx: token position (‘index’) in the given utterance, starting at 1
- w_L1: preceding token
- w_R1: subsequent token
tokens = []

for text in texts:
    tok_d = {}
    tok_d['text_id'] = text.get('id')
    for u in text.findall('u'):
        tok_d['u_n'] = u.get('n')
        u_toks = list(u.iter('w'))
        for i, w in enumerate(u_toks):
            tok_d['w_pos'] = w.get('pos')
            tok_d['w_lemma'] = w.get('lemma')
            tok_d['w_class'] = w.get('class')
            tok_d['w_usas'] = w.get('usas')
            tok_d['w_text'] = w.text
            tok_d['w_idx'] = i + 1
            tok_d['w_L1'] = u_toks[i-1].text if i > 0 else '<s>'
            tok_d['w_R1'] = u_toks[i+1].text if i < len(u_toks) - 1 else '</s>'
            tokens.append(tok_d.copy())
tokens = pd.DataFrame(tokens)
tokens.head(20)
assert len(tokens) == tokens_n
I export the full token table to tokens.csv.
if not testing:
    tokens.to_csv('../out/tokens.csv', index=False)
I also export a smaller version for use in spreadsheet software. This version contains the first 50,000 tokens in the corpus and is stored in tokens_small.csv.
if not testing:
    (tokens
     .head(50_000)
     .to_csv('../out/tokens_small.csv', index=False))
tokens.info()
toks_utt = pd.merge(
    tokens,
    utterances,
    on=['text_id', 'u_n']
)
toks_utt.info()
toks_utt_text = pd.merge(
    toks_utt,
    meta_texts,
    on='text_id'
)
toks_utt_text.info()
toks_utt_text_speakers = pd.merge(
    toks_utt_text,
    meta_speakers,
    left_on='u_who',
    right_on='who'
)
toks_utt_text_speakers.info()
if not testing:
    toks_utt_text_speakers.to_csv('../out/tokens-plus-meta.csv', index=False)
    print(f'number of rows: {len(toks_utt_text_speakers)}')
    print(f'file size: {os.path.getsize("../out/tokens-plus-meta.csv") / 1_000_000:.2f} MB')
I also write out a small version containing the first 50,000 rows for use in spreadsheet software:
if not testing:
    toks_utt_text_speakers.iloc[:50_000].to_csv(
        '../out/tokens-plus-meta_small.csv', index=False)
    print(f'number of rows: {len(toks_utt_text_speakers.iloc[:50_000])}')
    print(f'file size: {os.path.getsize("../out/tokens-plus-meta_small.csv") / 1_000_000:.2f} MB')
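As a quick sanity check on the exports, the small merged table can be read back with pandas; this is only a sketch, assuming the output paths and column names used above:

import pandas as pd

# load the small merged table and count tokens per speaker (illustrative only)
sample = pd.read_csv('../out/tokens-plus-meta_small.csv')
print(sample.groupby('u_who')['w_text'].count().sort_values(ascending=False).head())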