
A complete walkthrough in which we take our programmatically annotated dataset from a previous post and use it to fine-tune a transformer.
The full project can be found in this repository.
Data
We will use the same dataset (ents-iob.json) that we created previously in Creating Custom Named Entity Recognition Models.
[
  {
    "tokens": ["(", "6:51", "-", "1st", ")", "(", "Shotgun", ")", "P.Mahomes", "scrambles", "right", "end", "to", "LAC", "34", "for", "2", "yards", "(", "S.Joseph", ";", "K.Van", "Noy", ")", ".", "FUMBLES", "(", "S.Joseph", ")", ",", "and", "recovers", "at", "LAC", "34", "."],
    "labels": ["O", "B-TIME", "O", "B-PERIOD", "O", "O", "B-FORMATION", "O", "B-PLAYER", "B-EVENT", "B-DIRECTION", "O", "O", "B-TEAM", "B-QUANTITY", "O", "B-QUANTITY", "O", "O", "B-PLAYER", "O", "B-PLAYER", "I-PLAYER", "O", "O", "O", "O", "B-PLAYER", "O", "O", "O", "O", "O", "B-TEAM", "B-QUANTITY", "O"]
  },
  ...
]
Libraries
pip install extr-ds
pip install tensorflow
pip install transformers
pip install datasets
pip install evaluate
pip install seqeval
Configuration
We will be fine-tuning the bert-base-cased checkpoint.
epochs = 15
model_checkpoint = 'bert-base-cased'
model_output_checkpoint = 'transformers/nfl_pbp_token_classifier'

entity_groups = [
'TIME',
'PERIOD',
'TEAM',
'PLAYER',
'POSITION',
'FORMATION',
'EVENT',
'DIRECTION',
'QUANTITY'
]
labels = ['O'] + \
    [f'B-{label}' for label in entity_groups] + \
    [f'I-{label}' for label in entity_groups]
label2id = { label:i for i, label in enumerate(labels) }
id2label = { i:label for i, label in enumerate(labels) }
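With nine entity groups, this yields 19 labels in total. A quick illustrative check:

print(len(labels))  # 19: 'O' plus a B-/I- pair for each of the 9 entity groups
print(labels[:3])   # ['O', 'B-TIME', 'B-PERIOD']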
Format the Dataset
The original dataset used nltk.tokenize.word_tokenize to tokenize the text. The transformer's tokenizer splits words into sub-tokens, so the transition is a bit awkward. The align_labels method helps extend our labeling onto these sub-tokens as I-<entity_group>. This will let us use the aggregation_strategy='simple' option later when extracting entities.
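For intuition, here is a minimal sketch (assuming the bert-base-cased tokenizer used later in the Model section) of how a single word fans out into sub-tokens, and how word_ids maps each sub-token back to its source word:

from transformers import AutoTokenizer

bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
encoding = bert_tokenizer(['P.Mahomes', 'scrambles'], is_split_into_words=True)

print(bert_tokenizer.convert_ids_to_tokens(encoding['input_ids']))
# something like: ['[CLS]', 'P', '.', 'Ma', '##hom', '##es', 'scramble', '##s', '[SEP]']
print(encoding.word_ids(batch_index=0))
# something like: [None, 0, 0, 0, 0, 0, 1, 1, None]
# None marks special tokens; a repeated index marks sub-tokens of the same word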
import os
import re
import json
import random

from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from extr_ds.manager.utils.filesystem import load_document

def align_labels(tokenized_inputs, label_list):
    labels = []
    previous_word_idx = None
    for word_idx in tokenized_inputs.word_ids(batch_index=0):
        label_id = -100
        if word_idx is not None:
            # sub-tokens after the first keep the entity but switch B- to I-
            label = re.sub(r'^[BI]-(.+)$', r'I-\g<1>', label_list[word_idx]) \
                if word_idx == previous_word_idx \
                else label_list[word_idx]
            label_id = label2id[label]

        labels.append(label_id)
        previous_word_idx = word_idx

    return labels
def get_dataset(tokenizer, model):
    def tokenize_and_align_labels(record):
        tokenized_inputs = tokenizer(
            record['tokens'],
            truncation=True,
            is_split_into_words=True
        )

        tokenized_inputs['labels'] = align_labels(
            tokenized_inputs,
            record['labels']
        )

        return tokenized_inputs

    ents_dataset = json.loads(
        load_document(os.path.join('4', 'ents-iob.json'))
    )
    random.shuffle(ents_dataset)

    pivot = int(len(ents_dataset) * .8)

    data_collator = DataCollatorForTokenClassification(
        tokenizer,
        return_tensors='tf'
    )

    train_dataset = Dataset.from_list(ents_dataset[:pivot])
    tf_train_set = model.prepare_tf_dataset(
        train_dataset.map(
            tokenize_and_align_labels,
            batched=False
        ),
        shuffle=True,
        collate_fn=data_collator,
    )

    test_dataset = Dataset.from_list(ents_dataset[pivot:])
    tf_test_set = model.prepare_tf_dataset(
        test_dataset.map(
            tokenize_and_align_labels,
            batched=False
        ),
        shuffle=True,
        collate_fn=data_collator,
    )

    return tf_train_set, tf_test_set
Metrics
import numpy
import evaluate
import tensorflow as tf

from transformers.keras_callbacks import KerasMetricCallback

seqeval = evaluate.load('seqeval')

def compute_metrics(preds):
    predictions, actuals = preds
    predictions = numpy.argmax(predictions, axis=2)

    # drop the -100 positions (special tokens) before scoring
    results = seqeval.compute(
        predictions=[
            [labels[p] for p, l in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, actuals)
        ],
        references=[
            [labels[l] for p, l in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, actuals)
        ]
    )

    return {
        key: results[f'overall_{key}']
        for key in ['precision', 'recall', 'f1', 'accuracy']
    }

# tf_test_set comes from get_dataset in the Model section below
callbacks = [
    KerasMetricCallback(
        metric_fn=compute_metrics,
        eval_dataset=tf_test_set
    ),
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
]
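As a quick sanity check, seqeval can also be called directly on hand-made label sequences (a hypothetical example); a perfect match scores 1.0:

check = seqeval.compute(
    predictions=[['O', 'B-PLAYER', 'I-PLAYER', 'O']],
    references=[['O', 'B-PLAYER', 'I-PLAYER', 'O']]
)
print(check['overall_f1'])  # 1.0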
Model
The Adam optimizer's default learning rate is too high for fine-tuning; setting it lower helps with convergence.
import tensorflow as tf
from transformers import AutoTokenizer, \
    TFAutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint
)

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id
)

tf_train_set, tf_test_set = get_dataset(tokenizer, model)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer)

model.fit(
    x=tf_train_set,
    validation_data=tf_test_set,
    epochs=epochs,
    callbacks=callbacks
)
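As an optional variant (a sketch, not what this walkthrough uses), transformers ships a create_optimizer helper that pairs the low starting rate with a linear decay schedule, a common fine-tuning setup:

from transformers import create_optimizer

# number of batches per epoch; prepare_tf_dataset output has known cardinality
num_train_steps = len(tf_train_set) * epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,        # same low starting rate as above
    num_warmup_steps=0,
    num_train_steps=num_train_steps
)
model.compile(optimizer=optimizer)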
Save the Model
for model_to_save in [tokenizer, model]:
    model_to_save.save_pretrained(model_output_checkpoint)
Invoking the Custom Model
from transformers import pipeline

classifier = pipeline(
    'ner',
    model=model_output_checkpoint,
    aggregation_strategy='simple'
)
examples = [
    '(6:51 - 1st) (Shotgun) P.Mahomes scrambles right end to LAC 34 for 2 yards (S.Joseph; K.Van Noy). FUMBLES (S.Joseph), and recovers at LAC 34.',
]

responses = classifier(examples)
print(responses)
The aggregation_strategy='simple' option merges our B/I labels into our entity_groups, as shown below. See Converting Transformer Inference Output Back to the IOB2 Format to convert the response back into the IOB2 format.
[
{'entity_group': 'TIME', 'score': 0.9888856, 'word': '6 : 51', 'start': 1, 'end': 5},
{'entity_group': 'PERIOD', 'score': 0.9887093, 'word': '1st', 'start': 8, 'end': 11},
{'entity_group': 'FORMATION', 'score': 0.98260975, 'word': 'Shotgun', 'start': 14, 'end': 21},
{'entity_group': 'PLAYER', 'score': 0.9936474, 'word': 'P. Mahomes', 'start': 23, 'end': 32},
{'entity_group': 'EVENT', 'score': 0.69440436, 'word': 'scrambles', 'start': 33, 'end': 42},
{'entity_group': 'DIRECTION', 'score': 0.88298887, 'word': 'right', 'start': 43, 'end': 48},
{'entity_group': 'TEAM', 'score': 0.97735167, 'word': 'LAC', 'start': 56, 'end': 59},
{'entity_group': 'QUANTITY', 'score': 0.9734075, 'word': '34', 'start': 60, 'end': 62},
{'entity_group': 'QUANTITY', 'score': 0.9110169, 'word': '2', 'start': 67, 'end': 68},
{'entity_group': 'PLAYER', 'score': 0.9935433, 'word': 'S. Joseph', 'start': 76, 'end': 84},
{'entity_group': 'PLAYER', 'score': 0.9919572, 'word': 'K. Van Noy', 'start': 86, 'end': 95},
{'entity_group': 'PLAYER', 'score': 0.9934915, 'word': 'S. Joseph', 'start': 107, 'end': 115},
{'entity_group': 'TEAM', 'score': 0.97411484, 'word': 'LAC', 'start': 134, 'end': 137},
{'entity_group': 'QUANTITY', 'score': 0.9710606, 'word': '34', 'start': 138, 'end': 140}
]
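As a rough sketch of that conversion (a hypothetical helper using simple whitespace tokens rather than the original nltk tokenization, so punctuation attached to words is not split off):

def to_iob2(text, entities):
    iob_labels = []
    position = 0
    for token in text.split():
        # locate this token's character span in the original text
        start = text.index(token, position)
        end = start + len(token)
        position = end

        label = 'O'
        for entity in entities:
            if start >= entity['start'] and end <= entity['end']:
                # B- at the span start, I- for continuations
                prefix = 'B' if start == entity['start'] else 'I'
                label = f"{prefix}-{entity['entity_group']}"
                break

        iob_labels.append(label)
    return iob_labels

print(to_iob2(examples[0], responses[0]))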