Vitaly Bushaev (bushaev)

  • ITMO University
  • Saint Petersburg
# AMSGrad: Adam with a non-decreasing second-moment estimate (v_hat)
for t in range(num_iterations):
    g = compute_gradient(x, y)
    m = beta_1 * m + (1 - beta_1) * g                  # first moment: EMA of gradients
    v = beta_2 * v + (1 - beta_2) * np.power(g, 2)     # second moment: EMA of squared gradients
    v_hat = np.maximum(v, v_hat)                       # keep the running maximum of v
    w = w - step_size * m / (np.sqrt(v_hat) + epsilon)
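The loop assumes w, m, v, v_hat and the hyperparameters already exist. A minimal sketch of that setup, using the default values commonly quoted for Adam-family optimizers and an arbitrary 10-dimensional parameter vector (both are illustrative assumptions, not part of the gist):

import numpy as np

# Assumed setup for the AMSGrad loop above; the defaults are the values
# commonly used for Adam-family optimizers, not taken from the gist.
step_size = 0.001
beta_1, beta_2 = 0.9, 0.999
epsilon = 1e-8

w = np.zeros(10)          # parameter vector; the size is arbitrary here
m = np.zeros_like(w)      # first-moment estimate
v = np.zeros_like(w)      # second-moment estimate
v_hat = np.zeros_like(w)  # running maximum of v (the AMSGrad-specific state)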
bushaev / Nadam.py
Last active October 22, 2018 12:49
# Nadam: Adam with a Nesterov-style look-ahead on the first moment
for t in range(1, num_iterations + 1):                 # start at 1 so the bias corrections are well-defined
    g = compute_gradient(x, y)
    m = beta_1 * m + (1 - beta_1) * g
    v = beta_2 * v + (1 - beta_2) * np.power(g, 2)
    # look-ahead: blend the bias-corrected momentum with the bias-corrected current gradient
    m_hat = beta_1 * m / (1 - np.power(beta_1, t)) + (1 - beta_1) * g / (1 - np.power(beta_1, t))
    v_hat = v / (1 - np.power(beta_2, t))
    w = w - step_size * m_hat / (np.sqrt(v_hat) + epsilon)
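With the common simplification of sharing one bias-correction denominator for both terms (Dozat's original Nadam uses beta_1^(t+1) for the momentum term), the look-ahead collapses into a single expression. An equivalent rewrite of the m_hat line, using the same variables as the snippet:

# Equivalent one-line form: apply the momentum blend first, then bias-correct once.
m_hat = (beta_1 * m + (1 - beta_1) * g) / (1 - np.power(beta_1, t))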
bushaev / Adamax.py
Last active October 22, 2018 12:49
# Adamax: Adam with the second moment replaced by an exponentially weighted infinity norm
for t in range(1, num_iterations + 1):                 # start at 1 so the bias correction is well-defined
    g = compute_gradient(x, y)
    m = beta_1 * m + (1 - beta_1) * g
    m_hat = m / (1 - np.power(beta_1, t))              # bias-corrected first moment
    v = np.maximum(beta_2 * v, np.abs(g))              # infinity-norm update; no bias correction needed
    w = w - step_size * m_hat / v
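One subtlety: v here tracks an exponentially weighted infinity norm of the gradients, so it needs neither a square root nor bias correction, but it can be exactly zero on early steps. A defensive variant of the update line (the guard is an illustrative addition, not part of the gist):

# Guard against v == 0 (e.g. a zero first gradient); illustrative addition.
w = w - step_size * m_hat / np.maximum(v, epsilon)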
bushaev / Adam.py
Last active October 22, 2018 12:49
# Adam: bias-corrected EMAs of the gradient and the squared gradient
for t in range(1, num_iterations + 1):                 # start at 1 so the bias corrections are well-defined
    g = compute_gradient(x, y)
    m = beta_1 * m + (1 - beta_1) * g                  # first moment
    v = beta_2 * v + (1 - beta_2) * np.power(g, 2)     # second moment
    m_hat = m / (1 - np.power(beta_1, t))              # bias corrections
    v_hat = v / (1 - np.power(beta_2, t))
    w = w - step_size * m_hat / (np.sqrt(v_hat) + epsilon)
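To see all the pieces working together, here is a self-contained toy run of the loop above on the one-dimensional quadratic f(w) = (w - 3)^2; the objective, iteration count, and hyperparameter values are illustrative assumptions, not from the gist:

import numpy as np

# Toy problem: minimize f(w) = (w - 3)^2, whose gradient is 2 * (w - 3).
step_size, beta_1, beta_2, epsilon = 0.1, 0.9, 0.999, 1e-8
w, m, v = 0.0, 0.0, 0.0

for t in range(1, 501):
    g = 2 * (w - 3)                                    # gradient of the toy objective
    m = beta_1 * m + (1 - beta_1) * g
    v = beta_2 * v + (1 - beta_2) * g ** 2
    m_hat = m / (1 - beta_1 ** t)
    v_hat = v / (1 - beta_2 ** t)
    w = w - step_size * m_hat / (np.sqrt(v_hat) + epsilon)

print(w)  # approaches the minimizer 3.0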
# AdaGrad: scale each step by the accumulated sum of squared gradients
grad_squared = 0
for _ in range(num_iterations):
    dw = compute_gradient(x, y)
    grad_squared += dw * dw
    w = w - (lr / (np.sqrt(grad_squared) + epsilon)) * dw   # epsilon (e.g. 1e-8) is commonly added to avoid division by zero
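One consequence of accumulating squared gradients is that the effective learning rate lr / sqrt(grad_squared) can only shrink. A tiny worked illustration with a constant gradient of 1 (a contrived assumption, chosen just to expose the schedule):

import numpy as np

lr, grad_squared = 0.1, 0.0
for t in range(1, 6):
    grad_squared += 1.0 ** 2          # pretend every gradient is exactly 1
    print(t, lr / np.sqrt(grad_squared))
# effective step: 0.100, 0.071, 0.058, 0.050, 0.045 -> decays like lr / sqrt(t)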
# RMSProp: AdaGrad with an exponential moving average instead of a running sum
grad_squared = 0
for _ in range(num_iterations):
    dw = compute_gradient(x, y)
    grad_squared = 0.9 * grad_squared + 0.1 * dw * dw        # decay rate 0.9
    w = w - (lr / (np.sqrt(grad_squared) + epsilon)) * dw    # epsilon avoids division by zero
bushaev / rprop.py
Created September 1, 2018 17:50
simplest form of the Rprop update rule
for t in range(1, num_iterations + 1):                 # start at 1 so dw[t - 1] exists
    dw[t] = compute_gradient(x, y)
    if dw[t] * dw[t - 1] > 0:                          # same sign: accelerate
        step_size = min(step_size * incFactor, step_size_max)
    elif dw[t] * dw[t - 1] < 0:                        # sign flip: overshot, slow down
        step_size = max(step_size * decFactor, step_size_min)
    w[t] = w[t - 1] - sign(dw[t]) * step_size          # step against the gradient's sign only
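The scalar version above shares one step size across all weights; full Rprop keeps a separate step size per parameter. A vectorized NumPy sketch of that variant, given w, compute_gradient, and num_iterations from context (the increase/decrease factors and bounds follow the usual Rprop defaults and are assumptions, not values from the gist):

import numpy as np

# Per-parameter Rprop sketch; 1.2 / 0.5 and the step bounds are the
# commonly cited Rprop defaults, used here as assumptions.
incFactor, decFactor = 1.2, 0.5
step_size_min, step_size_max = 1e-6, 50.0

step = np.full_like(w, 0.01)          # one step size per weight
prev_dw = np.zeros_like(w)

for _ in range(num_iterations):
    dw = compute_gradient(x, y)
    same_sign = dw * prev_dw > 0      # gradient kept its sign: grow the step
    flipped = dw * prev_dw < 0        # gradient flipped sign: shrink the step
    step[same_sign] = np.minimum(step[same_sign] * incFactor, step_size_max)
    step[flipped] = np.maximum(step[flipped] * decFactor, step_size_min)
    w = w - np.sign(dw) * step        # move by step in the direction opposite each gradient
    prev_dw = dw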