import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


class Word2VecSimple:
    """
    Simplified Word2Vec (Skip-gram) implementation for educational purposes.
    """

    def __init__(self, embedding_dim=50, window_size=2, learning_rate=0.025):
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.word2idx = {}
        self.idx2word = {}
        self.W1 = None  # Input weight matrix (V x N)
        self.W2 = None  # Output weight matrix (N x V)

    def construir_vocabulario(self, corpus):
        """Builds the word-to-index mapping."""
        vocab = Counter()
        for oracion in corpus:
            vocab.update(oracion)
        # Unique words
        palabras_unicas = list(vocab.keys())
        self.word2idx = {palabra: idx for idx, palabra in enumerate(palabras_unicas)}
        self.idx2word = {idx: palabra for palabra, idx in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)
        # Randomly initialize the weight matrices
        self.W1 = np.random.uniform(-0.5, 0.5, (self.vocab_size, self.embedding_dim))
        self.W2 = np.random.uniform(-0.5, 0.5, (self.embedding_dim, self.vocab_size))
        print(f"Vocabulary built: {self.vocab_size} words")

    def generar_pares_skipgram(self, corpus):
        """Generates (center_word, context_word) pairs."""
        pares = []
        for oracion in corpus:
            for i, palabra_central in enumerate(oracion):
                if palabra_central not in self.word2idx:
                    continue
                # Context window around the center word
                inicio = max(0, i - self.window_size)
                fin = min(len(oracion), i + self.window_size + 1)
                for j in range(inicio, fin):
                    if i != j and oracion[j] in self.word2idx:
                        pares.append((palabra_central, oracion[j]))
        return pares

    def softmax(self, x):
        """Numerically stable softmax."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum()

    def entrenar(self, corpus, epochs=100):
        """Trains the model with Skip-gram and SGD."""
        self.construir_vocabulario(corpus)
        pares = self.generar_pares_skipgram(corpus)
        print(f"Training pairs generated: {len(pares)}")
        losses = []
        for epoch in range(epochs):
            loss_epoch = 0
            for palabra_central, palabra_contexto in pares:
                # One-hot encoding of the center word
                idx_central = self.word2idx[palabra_central]
                idx_contexto = self.word2idx[palabra_contexto]
                x = np.zeros(self.vocab_size)
                x[idx_central] = 1
                # Forward pass
                h = np.dot(x, self.W1)  # Hidden layer (embedding)
                u = np.dot(h, self.W2)  # Scores before softmax
                y_pred = self.softmax(u)
                # Cross-entropy loss
                loss_epoch += -np.log(y_pred[idx_contexto] + 1e-10)
                # Backward pass (gradients)
                e = y_pred.copy()
                e[idx_contexto] -= 1  # Prediction error
                # Weight updates
                dW2 = np.outer(h, e)
                dW1 = np.outer(x, np.dot(self.W2, e))
                self.W2 -= self.learning_rate * dW2
                self.W1 -= self.learning_rate * dW1
            avg_loss = loss_epoch / len(pares)
            losses.append(avg_loss)
            if epoch % 20 == 0:
                print(f"Epoch {epoch}: Loss = {avg_loss:.4f}")
        return losses

    def obtener_embedding(self, palabra):
        """Returns the embedding vector of a word."""
        if palabra in self.word2idx:
            idx = self.word2idx[palabra]
            return self.W1[idx]
        return None

    def palabras_similares(self, palabra, top_n=5):
        """Finds the most similar words by cosine similarity."""
        if palabra not in self.word2idx:
            return []
        vector_objetivo = self.obtener_embedding(palabra)
        similitudes = []
        for otra_palabra, idx in self.word2idx.items():
            if otra_palabra == palabra:
                continue
            vector_otro = self.W1[idx]
            # Cosine similarity
            sim = np.dot(vector_objetivo, vector_otro) / (
                np.linalg.norm(vector_objetivo) * np.linalg.norm(vector_otro) + 1e-10
            )
            similitudes.append((otra_palabra, sim))
        similitudes.sort(key=lambda x: x[1], reverse=True)
        return similitudes[:top_n]
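

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). It assumes a tiny hand-made corpus
# of pre-tokenized sentences; hyperparameters are arbitrary. The loss curve and
# 2-D PCA projection are one possible way to exercise the matplotlib / PCA
# imports declared at the top of the file.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    corpus = [
        ["the", "cat", "eats", "fish"],
        ["the", "dog", "eats", "meat"],
        ["the", "cat", "plays", "with", "the", "dog"],
        ["the", "dog", "plays", "with", "the", "ball"],
    ]
    modelo = Word2VecSimple(embedding_dim=10, window_size=2, learning_rate=0.05)
    losses = modelo.entrenar(corpus, epochs=200)
    print(modelo.palabras_similares("cat", top_n=3))

    # Training-loss curve
    plt.figure()
    plt.plot(losses)
    plt.xlabel("Epoch")
    plt.ylabel("Average cross-entropy loss")
    plt.title("Skip-gram training loss")

    # 2-D PCA projection of the learned embeddings
    vectores = np.array([modelo.obtener_embedding(p) for p in modelo.word2idx])
    coords = PCA(n_components=2).fit_transform(vectores)
    plt.figure()
    plt.scatter(coords[:, 0], coords[:, 1])
    for palabra, (px, py) in zip(modelo.word2idx, coords):
        plt.annotate(palabra, (px, py))
    plt.show()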