@robintux
Created February 20, 2026 19:16
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
class Word2VecSimple:
    """
    Simplified Word2Vec (Skip-gram) implementation for educational purposes.
    """
    def __init__(self, embedding_dim=50, window_size=2, learning_rate=0.025):
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.word2idx = {}
        self.idx2word = {}
        self.W1 = None  # Input matrix (V x N): one embedding row per word
        self.W2 = None  # Output matrix (N x V)
    def construir_vocabulario(self, corpus):
        """Builds the word-index mapping."""
        vocab = Counter()
        for oracion in corpus:
            vocab.update(oracion)
        # Unique words
        palabras_unicas = list(vocab.keys())
        self.word2idx = {palabra: idx for idx, palabra in enumerate(palabras_unicas)}
        self.idx2word = {idx: palabra for palabra, idx in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)
        # Randomly initialize the weight matrices
        self.W1 = np.random.uniform(-0.5, 0.5, (self.vocab_size, self.embedding_dim))
        self.W2 = np.random.uniform(-0.5, 0.5, (self.embedding_dim, self.vocab_size))
        print(f"Vocabulary built: {self.vocab_size} words")
    def generar_pares_skipgram(self, corpus):
        """Generates (center_word, context_word) pairs."""
        pares = []
        for oracion in corpus:
            for i, palabra_central in enumerate(oracion):
                if palabra_central not in self.word2idx:
                    continue
                # Context window around the center word
                inicio = max(0, i - self.window_size)
                fin = min(len(oracion), i + self.window_size + 1)
                for j in range(inicio, fin):
                    if i != j and oracion[j] in self.word2idx:
                        pares.append((palabra_central, oracion[j]))
        return pares
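
    # Worked example: with window_size=2, the sentence ["el", "gato", "duerme"]
    # produces the pairs ("el","gato"), ("el","duerme"), ("gato","el"),
    # ("gato","duerme"), ("duerme","el"), ("duerme","gato").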
    def softmax(self, x):
        """Numerically stable softmax."""
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum()
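
    # Subtracting max(x) before exponentiating leaves the result unchanged
    # (softmax is shift-invariant) but prevents overflow in np.exp.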
    def entrenar(self, corpus, epochs=100):
        """Trains the model using Skip-gram with plain SGD."""
        self.construir_vocabulario(corpus)
        pares = self.generar_pares_skipgram(corpus)
        print(f"Training pairs generated: {len(pares)}")
        losses = []
        for epoch in range(epochs):
            loss_epoch = 0
            for palabra_central, palabra_contexto in pares:
                # One-hot encoding of the center word
                idx_central = self.word2idx[palabra_central]
                idx_contexto = self.word2idx[palabra_contexto]
                x = np.zeros(self.vocab_size)
                x[idx_central] = 1
                # Forward pass
                h = np.dot(x, self.W1)  # Hidden layer (the embedding)
                u = np.dot(h, self.W2)  # Scores before softmax
                y_pred = self.softmax(u)
                # Cross-entropy loss for this pair
                loss_epoch += -np.log(y_pred[idx_contexto] + 1e-10)
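                # For L = -log(softmax(u)[idx_contexto]), the gradient w.r.t.
                # the scores u is dL/du = y_pred - y_true (the standard
                # softmax + cross-entropy identity): exactly the vector e below.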
                # Backward pass (gradients)
                e = y_pred.copy()
                e[idx_contexto] -= 1  # Prediction error
                # Weight updates
                dW2 = np.outer(h, e)
                dW1 = np.outer(x, np.dot(self.W2, e))
                self.W2 -= self.learning_rate * dW2
                self.W1 -= self.learning_rate * dW1
            avg_loss = loss_epoch / len(pares)
            losses.append(avg_loss)
            if epoch % 20 == 0:
                print(f"Epoch {epoch}: loss = {avg_loss:.4f}")
        return losses
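
    # Note: the full softmax makes every update cost O(V). That is fine for a
    # toy vocabulary; production Word2Vec replaces it with negative sampling
    # or hierarchical softmax.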
    def obtener_embedding(self, palabra):
        """Returns the embedding vector of a word, or None if unknown."""
        if palabra in self.word2idx:
            idx = self.word2idx[palabra]
            return self.W1[idx]
        return None
    def palabras_similares(self, palabra, top_n=5):
        """Finds the most similar words by cosine similarity."""
        if palabra not in self.word2idx:
            return []
        vector_objetivo = self.obtener_embedding(palabra)
        similitudes = []
        for otra_palabra, idx in self.word2idx.items():
            if otra_palabra == palabra:
                continue
            vector_otro = self.W1[idx]
            # Cosine similarity (epsilon avoids division by zero)
            sim = np.dot(vector_objetivo, vector_otro) / (
                np.linalg.norm(vector_objetivo) * np.linalg.norm(vector_otro) + 1e-10
            )
            similitudes.append((otra_palabra, sim))
        similitudes.sort(key=lambda x: x[1], reverse=True)
        return similitudes[:top_n]
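
# Minimal usage sketch, assuming a toy corpus invented for illustration (any
# tokenized list of sentences works). It also plots a 2-D PCA projection of
# the embeddings, which is presumably what the PCA/matplotlib imports at the
# top of the file are intended for.
if __name__ == "__main__":
    corpus = [
        ["el", "gato", "come", "pescado"],
        ["el", "perro", "come", "carne"],
        ["el", "gato", "duerme", "mucho"],
        ["el", "perro", "duerme", "poco"],
    ]
    modelo = Word2VecSimple(embedding_dim=10, window_size=2, learning_rate=0.05)
    losses = modelo.entrenar(corpus, epochs=200)
    print(modelo.palabras_similares("gato", top_n=3))

    # Project the learned embeddings (rows of W1) to 2-D and label each point.
    coords = PCA(n_components=2).fit_transform(modelo.W1)
    plt.figure(figsize=(6, 6))
    plt.scatter(coords[:, 0], coords[:, 1])
    for palabra, idx in modelo.word2idx.items():
        plt.annotate(palabra, (coords[idx, 0], coords[idx, 1]))
    plt.title("Word embeddings (PCA projection)")
    plt.show()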