Sample usage for childes

CHILDES Corpus Readers

Read the XML version of the CHILDES corpus.

Setup

>>> from nltk.test.childes_fixt import setup_module
>>> setup_module()

How to use CHILDESCorpusReader

Import the CHILDESCorpusReader class and read the CHILDES corpus saved in the nltk_data directory.

>>> import nltk
>>> from nltk.corpus.reader import CHILDESCorpusReader
>>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')

Reading files in the Valian corpus (Valian, 1991).

>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
import pandas as pd
import numpy as np

# Load the '^'-separated dataset (the path was left empty in the source).
data = pd.read_csv(r"", sep='^')
# Largest value of the 'ЗП' column; later cells use it to scale that column.
max_sal = np.max(data['ЗП'])
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
# Persist both parts (paths were left empty in the source).
train_data.to_csv(r'', sep='^', index=False)
test_data.to_csv(r'', sep='^', index=False)
# Reload the training part from disk.
train_data = pd.read_csv(r"", sep='^')
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
import nltk
import re

nltk.download("stopwords")  # supports stop-word removal
nltk.download('punkt')  # splits text into a list of sentences
nltk.download('wordnet')  # performs lemmatization
from nltk.corpus import stopwords
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
# Shared resources used by tokenize(); they are looked up as globals at call time.
russian_stopwords = stopwords.words("russian")
# Frozen copy for O(1) membership tests, built once instead of once per word.
_russian_stopword_set = frozenset(russian_stopwords)
# NOTE(review): WordNetLemmatizer is presumably English-only, so Russian tokens
# may pass through unchanged -- confirm whether a Russian lemmatizer was intended.
lemmatize = nltk.WordNetLemmatizer()


def tokenize(sentence):
    """Turn a raw text into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased, every character that is not a Cyrillic or Latin
    letter is replaced by a space, the result is split with
    ``nltk.word_tokenize`` (Russian models), and Russian stop words are dropped.

    :param sentence: the text to process (must support ``.lower()``).
    :return: list of token strings.
    """
    # "ё"/"Ё" lie outside the а-я / А-Я code-point ranges, so they must be
    # listed explicitly or words containing them would be split apart.
    text = re.sub("[^а-яА-ЯёЁa-zA-Z]", " ", sentence.lower())
    words = nltk.word_tokenize(text, language="russian")
    return [
        lemmatize.lemmatize(word)
        for word in words
        if word not in _russian_stopword_set
    ]


# Tokenize every description in the full dataset.
token_text = [tokenize(sentence) for sentence in data['Описание']]
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
from gensim.models import Word2Vec

# Train 600-dimensional word vectors on the tokenized texts and save the model
# (path left empty in the source).  The source repeated this whole cell twice,
# which only retrained and re-saved the same model.
Word2Vec_model = Word2Vec(sentences=token_text, vector_size=600, window=5, min_count=5, workers=6)
Word2Vec_model.save(r"")
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
import numpy as np
from gensim.models import Word2Vec

Word2Vec_model = Word2Vec.load(r"")


def vectorize(sentence):
    """Return the mean Word2Vec vector of the tokens in *sentence*.

    Tokens missing from the model vocabulary are skipped.  When no token is
    known, a zero vector of the model's dimensionality is returned.

    :param sentence: iterable of token strings.
    :return: 1-D numpy array whose length is the model's vector size.
    """
    vectors = Word2Vec_model.wv
    known = [vectors[word] for word in sentence if word in vectors]
    if not known:
        # Use the model's own dimensionality rather than a hard-coded 600.
        return np.zeros(vectors.vector_size)
    return np.array(known).mean(axis=0)
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
# One averaged embedding per training description.
vectorize_text = [
    vectorize(tokenize(sentence)) for sentence in train_data['Описание']
]
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
from ast import literal_eval

# Build one 128-slot 0/1 row per training record: each 'Навыки' cell is parsed
# as a Python literal and every listed index is switched on.
skills_binary = []
for i in train_data['Навыки']:
    binr = [0] * 128
    for j in literal_eval(i):
        binr[int(j)] = 1
    skills_binary.append(binr)
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
# Scale every 'ЗП' value by the dataset maximum.
train_bin_sal = [value / max_sal for value in train_data['ЗП']]
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D
from gensim.models import Word2Vec
from tensorflow.keras.optimizers import Adam

Word2Vec_model = Word2Vec.load(r"")
num_skills = 128
embedding_dim = 256
vocab_size = len(Word2Vec_model.wv)

# Multi-label skills classifier: one sigmoid output per skill slot.
# NOTE(review): the training cell feeds this model averaged Word2Vec vectors
# (floats), while an Embedding layer presumably expects integer token ids --
# confirm the intended input representation.
model = Sequential()
# Define input layer
model.add(Input(shape=(None,), dtype='float32'))  # None allows for variable sequence lengths
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(GlobalAveragePooling1D())
model.add(Dense(512, activation='relu'))
model.add(Dense(num_skills, activation='sigmoid'))

opt = Adam(learning_rate=0.00005)
model.compile(optimizer=opt, loss='binary_crossentropy',
              metrics=['binary_accuracy'], )

# Summary
model.summary()
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, GlobalAveragePooling1D
from gensim.models import Word2Vec
from tensorflow.keras.optimizers import Adam

Word2Vec_model = Word2Vec.load(r"")
embedding_dim = 128
vocab_size = len(Word2Vec_model.wv)

# Single-output model with a sigmoid head.
# NOTE(review): the training cell fits this on 'ЗП' / max_sal, a continuous
# value; 'binary_crossentropy' with accuracy metrics looks like a
# classification setup -- confirm whether a regression loss was intended.
sal_model = Sequential()
# Define input layer
sal_model.add(Input(shape=(None,), dtype='float32'))  # None allows for variable sequence lengths
sal_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
sal_model.add(GlobalAveragePooling1D())
sal_model.add(Dense(128, activation='relu'))
sal_model.add(Dense(32, activation='relu'))
sal_model.add(Dense(8, activation='relu'))
sal_model.add(Dense(1, activation='sigmoid'))

opt = Adam(learning_rate=0.0001)
sal_model.compile(optimizer=opt, loss='binary_crossentropy',
                  metrics=['accuracy', 'binary_accuracy'], )

# Summary
sal_model.summary()
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
# Both models train on the same inputs, so build the array once.
features = np.array(vectorize_text)
model.fit(features, np.array(skills_binary), batch_size=32, epochs=20)
sal_model.fit(features, np.array(train_bin_sal), batch_size=32, epochs=5)
# Persist the trained models (paths were left empty in the source).
model.save(r'')
sal_model.save(r'')
>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> valian.fileids()
import tensorflow as tf
from sklearn.metrics import classification_report

# Reload both saved models (paths were left empty in the source).
loaded_model = tf.keras.models.load_model(r"")
loaded_sal_model = tf.keras.models.load_model(r"")

# Pick one training sample to inspect.
k = -7
sequences = [vectorize_text[k]]
# Pad sequences
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding='post', dtype='float32')

# Predict
predictions = loaded_model.predict(padded_sequences)
salo = loaded_sal_model.predict(padded_sequences)[0][0]

# Each pair prints the stored training value first, then the model output.
print("salooooo:", round(train_bin_sal[k]*max_sal))
print("salooooo:", round(salo*max_sal))
print("Predictions:", np.where(predictions[0] > 0.2, 1, 0))
print("Predictions:", np.array(skills_binary[k]))

Count the number of files

>>> len(valian.fileids())
43

Printing properties of the corpus files.

>>> corpus_data = valian.corpus(valian.fileids())
>>> print(corpus_data[0]['Lang'])
eng
>>> for key in sorted(corpus_data[0].keys()):
...    print(key, ": ", corpus_data[0][key])
Corpus :  valian
Date :  1986-03-04
Id :  01a
Lang :  eng
Version :  2.0.1
# Dense multi-label classifier with one sigmoid output per skill slot.
# NOTE(review): Dropout and ReduceLROnPlateau are not imported in any visible
# cell -- confirm they are imported elsewhere.
model = Sequential()
# Define input layer: a fixed-length vector of 600 values per sample
model.add(Input(shape=(600,), dtype='float32'))
model.add(Dense(450, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(325, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dense(num_skills, activation='sigmoid'))

# Halve the learning rate after 4 epochs without 'val_accuracy' improvement,
# never going below 0.0005.
rlrop = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=4, min_lr=0.0005)
opt = Adam()
model.compile(optimizer=opt, loss='binary_crossentropy',
              metrics=['accuracy'])

# Summary
model.summary()

Printing information of participants of the corpus. The most common codes for the participants are ‘CHI’ (target child), ‘MOT’ (mother), and ‘INV’ (investigator).

>>> corpus_participants = valian.participants(valian.fileids())
>>> for this_corpus_participants in corpus_participants[:2]:
...     for key in sorted(this_corpus_participants.keys()):
...         dct = this_corpus_participants[key]
...         print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())])
CHI :  [('age', 'P2Y1M3D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')]
INV :  [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')]
MOT :  [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')]
CHI :  [('age', 'P2Y1M12D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')]
INV :  [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')]
# Train with validation on the held-out set.  Keras documents `callbacks` as a
# list of callback instances, so the scheduler is wrapped in a list.
history = model.fit(
    np.array(vectorize_text),
    np.array(skills_binary),
    batch_size=32,
    epochs=20,
    callbacks=[rlrop],
    validation_data=(np.array(test_vectorize_text), np.array(test_skills_binary)),
)

Printing words.

>>> valian.words('Valian/01a.xml')
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...

Printing sentences.

>>> valian.sents('Valian/01a.xml')
[['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname',
  'and', 'it', 'is', 'March', 'fourth', 'I', 'believe', 'and', 'when',
  'was', "Parent's", 'birthday'], ["Child's"], ['oh', "I'm", 'sorry'],
from matplotlib import pyplot as plt
import keras

# Plot training vs. validation accuracy per epoch from the fit history.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epochs')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

You can specify the participants with the argument speaker.

>>> valian.words('Valian/01a.xml',speaker=['INV'])
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ...
>>> valian.words('Valian/01a.xml',speaker=['MOT'])
["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ...
>>> valian.words('Valian/01a.xml',speaker=['CHI'])
['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',...

tagged_words() and tagged_sents() return the usual (word,pos) tuple lists. POS tags in the CHILDES are automatically assigned by MOR and POST programs (MacWhinney, 2000).

>>> valian.tagged_words('Valian/01a.xml')[:30]
[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'),
('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'),
('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'),
('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'),
('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n'), ("Child's", 'n:prop'),
('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj'), ("that's", 'pro:dem'),
('okay', 'adj'), ('February', 'n:prop'), ('first', 'adj'),
('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')]
>>> valian.tagged_sents('Valian/01a.xml')[:10]
[[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'),
('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'),
('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'),
('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'),
('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n')],
[("Child's", 'n:prop')], [('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj')],
[("that's", 'pro:dem'), ('okay', 'adj')],
[('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'),
('eighty', 'det:num'), ('four', 'det:num')],
[('great', 'adj')],
[('and', 'coord'), ("she's", 'pro:sub'), ('two', 'det:num'), ('years', 'n'), ('old', 'adj')],
[('correct', 'adj')],
[('okay', 'co')], [('she', 'pro:sub'), ('just', 'adv:int'), ('turned', 'part'), ('two', 'det:num'),
('a', 'det'), ('month', 'n'), ('ago', 'adv')]]

When the argument stem is true, the word stems (e.g., ‘is’ -> ‘be-3S’) are used instead of the original words.

>>> valian.words('Valian/01a.xml')[:30]
['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', ...
>>> valian.words('Valian/01a.xml',stem=True)[:30]
['at', 'Parent', 'Lastname', 's', 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'be-3S', ...

When the argument replace is true, the replaced words are used instead of the original words.

>>> valian.words('Valian/01a.xml',speaker='CHI')[247]
'tikteat'
>>> valian.words('Valian/01a.xml',speaker='CHI',replace=True)[247]
'trick'

When the argument relation is true, the grammatical relations in the sentence are returned. See Sagae et al. (2010) for details of the relational structure adopted in the CHILDES.

>>> valian.words('Valian/01a.xml',relation=True)[:10]
[[('at', 'prep', '1|0|ROOT'), ('Parent', 'n', '2|5|VOC'), ('Lastname', 'n', '3|5|MOD'), ('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|1|JCT'), ('Child', 'n', '7|8|NAME'), ('Lastname', 'n', '8|6|POBJ'), ('and', 'coord', '9|8|COORD'), ('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v', '11|9|COMP'), ('March', 'n', '12|11|PRED'), ('fourth', 'adj', '13|12|MOD'), ('I', 'pro', '15|16|SUBJ'), ('believe', 'v', '16|14|ROOT'), ('and', 'coord', '18|17|ROOT'), ('when', 'adv', '19|20|PRED'), ('be-PAST', 'v', '20|18|COMP'), ('Parent', 'n', '21|23|MOD'), ('s', 'poss', '22|23|MOD'), ('birth', 'n', '23|20|SUBJ')], [('Child', 'n', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|4|COM'), ('I', 'pro', '3|4|SUBJ'), ('be', 'v', '4|0|ROOT'), ('sorry', 'adj', '5|4|PRED')], [('that', 'pro', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')], [('February', 'n', '1|6|VOC'), ('first', 'adj', '2|6|ENUM'), ('nineteen', 'det', '4|6|ENUM'), ('eighty', 'det', '5|6|ENUM'), ('four', 'det', '6|0|ROOT')], [('great', 'adj', '1|0|ROOT')], [('and', 'coord', '1|0|ROOT'), ('she', 'pro', '2|1|ROOT'), ('be', 'aux', '3|5|AUX'), ('two', 'det', '4|5|QUANT'), ('year-PL', 'n', '5|2|ROOT'), ('old', 'adj', '6|5|MOD')], [('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|0|ROOT'), ('just', 'adv', '2|3|JCT'), ('turn-PERF', 'part', '3|1|XCOMP'), ('two', 'det', '4|6|QUANT'), ('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]]

Printing age. When the argument month is true, the age information in the CHILDES format is converted into the number of months.

>>> valian.age()
['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ...
>>> valian.age('Valian/01a.xml')
['P2Y1M3D']
>>> valian.age('Valian/01a.xml',month=True)
[25]

Printing MLU. The criteria for the MLU computation are broadly based on Brown (1973).

>>> valian.MLU()
[2.3574660633484..., 2.292682926829..., 3.492857142857..., 2.961783439490...,
 2.0842696629213..., 3.169811320754..., 3.137404580152..., 3.0578034682080...,
 4.090163934426..., 3.488372093023..., 2.8773584905660..., 3.4792899408284...,
 4.0111940298507..., 3.456790123456..., 4.487603305785..., 4.007936507936...,
 5.25, 5.154696132596..., ...]
>>> valian.MLU('Valian/01a.xml')
[2.35746606334...]

Basic stuff

Count the number of words and sentences of each file.

>>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml')
>>> for this_file in valian.fileids()[:6]:
...     print(valian.corpus(this_file)[0]['Corpus'], valian.corpus(this_file)[0]['Id'])
...     print("num of words: %i" % len(valian.words(this_file)))
...     print("num of sents: %i" % len(valian.sents(this_file)))
valian 01a
num of words: 3606
num of sents: 1027
valian 01b
num of words: 4376
num of sents: 1274
valian 02a
num of words: 2673
num of sents: 801
valian 02b
num of words: 5020
num of sents: 1583
valian 03a
num of words: 2743
num of sents: 988
valian 03b
num of words: 4409
num of sents: 1397