
#coding: utf-8

#############################
#tokenization problem
#############################
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras

import os

# read the input file

arq1 = tf.io.read_file(
    "D:/Google_Drive/disciplina_RP/Curso_Tópicos/NLP/sarquivo_txt/saltimbanco.txt")

# build the word vocabulary


tokenizer = tfds.features.text.Tokenizer()
vocabulary_set = set()
some_tokens = tokenizer.tokenize(arq1.numpy())
vocabulary_set.update(some_tokens)
vocab_size = len(vocabulary_set)
print(vocab_size)
print(vocabulary_set)
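
# Illustrative check (not part of the original run): the TFDS Tokenizer splits
# on non-alphanumeric characters, so a short sentence becomes a list of words.
print(tokenizer.tokenize("ousávamos dizer que a praça era do povo"))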

# encode the words as integers


encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
encode_example = encoder.encode(
    "ousávamos dizer que a praça e a Carroça eram do povo")
print(encode_example)
encode_example1 = encoder.encode(arq1.numpy())
arq2 = arq1.numpy()  # raw bytes of the input file
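
# Round-trip check (illustrative): TokenTextEncoder also provides decode(),
# which maps the integer ids back to a whitespace-joined token string.
print(encoder.decode(encode_example))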

k = len(some_tokens) - 3

train_data = []
train_labels = []
for i in range(k - 1):
    # sliding window: 3 consecutive token ids are the features,
    # the token id that follows them is the label
    train_data.append(encode_example1[i:i + 3])
    train_labels.append(encode_example1[i + 3])

#train_labels1=tf.convert_to_tensor(train_labels)
features, labels = (train_data, train_labels)
dataset = tf.data.Dataset.from_tensor_slices((features,labels))
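
# Optional sanity check (illustrative): each element of the pipeline should be
# a (3,) integer context window paired with a scalar label.
print(dataset.element_spec)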

sample_text, sample_labels = next(iter(dataset))


print(sample_text)
print(sample_labels)

#create a simple model


embedding_dim = 16
model = keras.Sequential()
#keras.layers.InputLayer(dataset, ragged=True)
# use encoder.vocab_size (which reserves extra ids for padding and OOV) so
# every id produced by encoder.encode() is in range for the Embedding layer
model.add(tf.keras.layers.Embedding(encoder.vocab_size, embedding_dim))
# flatten the (3, embedding_dim) context window into a single vector so the
# model emits one softmax distribution per window, matching the scalar labels
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(encoder.vocab_size, activation='softmax'))

model.compile(optimizer='adam',
              loss=keras.losses.SparseCategoricalCrossentropy(),
              metrics=[keras.metrics.SparseCategoricalAccuracy()])
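
# Quick check (illustrative): build the model for the 3-token input and print
# the layer shapes to confirm it ends in a (batch, encoder.vocab_size) softmax.
model.build(input_shape=(None, 3))
model.summary()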

history = model.fit(
    dataset.batch(32),  # fit() expects batched elements, so batch the pipeline
    epochs=10,
)
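
# Illustrative use of the trained model (an addition, not in the original
# script): predict the next token id for one context window and decode it.
import numpy as np
context = np.array([train_data[0]])  # shape (1, 3)
probs = model.predict(context)       # shape (1, encoder.vocab_size)
print(encoder.decode([int(np.argmax(probs[0]))]))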

# plot the training convergence curves


import matplotlib.pyplot as plt

history_dict = history.history

# the history key follows the metric's name, here 'sparse_categorical_accuracy'
acc = history_dict['sparse_categorical_accuracy']
loss = history_dict['loss']

epochs = range(1, len(acc) + 1)

plt.figure(figsize=(12,9))
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

plt.figure(figsize=(12,9))
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim((0.5,1))
plt.show()

# retrieve the embedding layer


e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)  # shape: (encoder.vocab_size, embedding_dim)
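
# Illustrative lookup (an addition beyond the original script): fetch the
# learned vector for a single word by encoding it first; the id indexes a row
# of the weight matrix directly.
povo_id = encoder.encode("povo")[0]
print(weights[povo_id])  # the 16-dimensional embedding for "povo"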
