AReaL


[Figure: anatomy of one training sample. The prompt "123+456=?" and the completion "The answer is 579 ." are stored as input_ids, together with per-token logprobs, a loss_mask that is 0 on prompt tokens and 1 on completion tokens, a scalar reward (1.0), per-token advantages, and the weight version ("v2") that generated each token.]
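
A minimal sketch of how one such sample might be held in code; the `RolloutSample` class and its field types are illustrative, not AReaL's actual data structures:

```python
from dataclasses import dataclass

@dataclass
class RolloutSample:
    """One trajectory as it might sit in the training buffer (illustrative)."""
    input_ids: list[str]      # prompt + completion tokens
    logprobs: list[float]     # behavior-policy logprob of each generated token
    loss_mask: list[int]      # aligned with input_ids: 0 on prompt, 1 on completion
    reward: float             # scalar sequence-level reward
    advantages: list[float]   # per-generated-token advantage estimates
    versions: list[str]       # weight version that produced each generated token

sample = RolloutSample(
    input_ids=["123+456=?", "The", "answer", "is", "579", "."],
    logprobs=[-0.12, -0.41, -0.35, -0.09, -0.22],
    loss_mask=[0, 1, 1, 1, 1, 1],
    reward=1.0,
    advantages=[0.12, 0.34, 0.56, 0.78, 0.91],
    versions=["v2", "v2", "v2", "v2", "v2"],
)
```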

[Figure: two GPU-utilization panels (GPU 0–3), each showing devices sitting at 0% utilization.]

Architecture paths

[Figure: a ⚡ weight update arriving at in-flight rollouts P1, P2, and P3.]

$$\left\lfloor \frac{N_r - 1}{B} \right\rfloor \leq i + \eta$$
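
Read as a staleness guard: a new rollout is admitted only while the training batch it would fall into, index $\lfloor (N_r-1)/B \rfloor$, is at most $\eta$ versions ahead of the current weight version $i$. A minimal sketch under that reading; the function name and surrounding control loop are assumptions:

```python
def can_start_rollout(num_rollouts: int, weight_version: int,
                      batch_size: int, max_staleness: int) -> bool:
    """Admit a new rollout only if floor((N_r - 1) / B) <= i + eta.

    num_rollouts   -- N_r, rollouts generated so far, counting this one
    weight_version -- i, version of the weights currently being trained
    batch_size     -- B, rollouts consumed per training step
    max_staleness  -- eta, maximum tolerated version gap
    """
    return (num_rollouts - 1) // batch_size <= weight_version + max_staleness
```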


$$J_{\text{PPO}}(\theta) = \mathbb{E}\left[\min\left(u_t(\theta)\,\hat{A}_t,\ \mathrm{clip}\bigl(u_t(\theta),\,1-\epsilon,\,1+\epsilon\bigr)\,\hat{A}_t\right)\right], \quad \text{where } u_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\text{old}}(a_t \mid s_t)}$$
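
A minimal PyTorch sketch of this clipped surrogate; the tensor names and mean reduction are assumptions:

```python
import torch

def ppo_clip_loss(logp_new: torch.Tensor,   # log pi_theta(a_t|s_t), per token
                  logp_old: torch.Tensor,   # log pi_old(a_t|s_t), per token
                  advantages: torch.Tensor, # A^_t, per token
                  eps: float = 0.2) -> torch.Tensor:
    # u_t(theta) = pi_theta / pi_old, computed in log space for stability
    ratio = torch.exp(logp_new - logp_old)
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantages
    # Negated because optimizers minimize while J is maximized
    return -torch.min(unclipped, clipped).mean()
```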

$$J(\theta) = \mathbb{E}\left[\sum_t \frac{\pi_{\text{prox}}}{\pi_{\text{behav}}}\,\min\left(u_t^{\text{prox}}\,\hat{A}_t,\ \mathrm{clip}\bigl(u_t^{\text{prox}},\,1-\epsilon,\,1+\epsilon\bigr)\,\hat{A}_t\right)\right], \quad \text{where } u_t^{\text{prox}}(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\text{prox}}(a_t \mid s_t)}$$
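
The decoupled variant separates the proximal policy $\pi_{\text{prox}}$ (the clipping anchor, taken from recent weights) from the behavior policy $\pi_{\text{behav}}$ (the possibly stale weights that actually sampled the tokens). A sketch under that reading; function and tensor names are assumptions:

```python
import torch

def decoupled_ppo_loss(logp_new: torch.Tensor,    # log pi_theta
                       logp_prox: torch.Tensor,   # log pi_prox (recent snapshot)
                       logp_behav: torch.Tensor,  # log pi_behav (sampling-time weights)
                       advantages: torch.Tensor,
                       eps: float = 0.2) -> torch.Tensor:
    # Clipping is done against the proximal policy...
    u_prox = torch.exp(logp_new - logp_prox)
    surrogate = torch.min(u_prox * advantages,
                          torch.clamp(u_prox, 1.0 - eps, 1.0 + eps) * advantages)
    # ...while an importance weight pi_prox / pi_behav corrects for the
    # stale behavior policy that generated the tokens.
    iw = torch.exp(logp_prox - logp_behav).detach()
    return -(iw * surrogate).mean()
```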

$$J_{\text{GRPO}}(\theta) = \mathbb{E}\left[\frac{1}{G}\sum_{i=1}^{G}\min\left(\rho_i(\theta)\,\hat{A}_i,\ \mathrm{clip}\bigl(\rho_i(\theta),\,1-\epsilon_{\text{clip}},\,1+\epsilon_{\text{clip}}\bigr)\,\hat{A}_i\right)\right]$$

$$\rho_i(\theta)=\frac{\pi_\theta(o_i \mid q)}{\pi_{\text{old}}(o_i \mid q)},\quad \hat{A}_i = \frac{\tilde r_i - \mu_G}{\sigma_G},\quad \mu_G = \frac{1}{G}\sum_{j=1}^{G}\tilde r_j,\quad \sigma_G = \sqrt{\frac{1}{G}\sum_{j=1}^{G}(\tilde r_j-\mu_G)^2 + \delta}$$

$$\tilde r_i = \text{reward\_scaling}\cdot r_i + \text{reward\_bias}$$
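
A sketch of the group-normalized advantage for one group of $G$ completions; `reward_scaling`, `reward_bias`, and $\delta$ follow the formula above, the function itself is illustrative:

```python
import torch

def grpo_advantages(rewards: torch.Tensor,   # shape [G]: raw rewards r_i
                    reward_scaling: float = 1.0,
                    reward_bias: float = 0.0,
                    delta: float = 1e-6) -> torch.Tensor:
    # r~_i = reward_scaling * r_i + reward_bias
    r = reward_scaling * rewards + reward_bias
    mu = r.mean()
    sigma = torch.sqrt(((r - mu) ** 2).mean() + delta)
    # A^_i: one normalized scalar per completion in the group
    return (r - mu) / sigma
```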

$$\max_\theta\ L(\theta) = \mathbb{E}_t\left[\frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\text{old}}}(a_t \mid s_t)}\,\hat{A}_t\right] \quad \text{s.t.} \quad \overline{D}_{\text{KL}}\bigl(\pi_{\theta_{\text{old}}}\,\|\,\pi_\theta\bigr) \le \delta$$

$$\theta_{k+1} = \theta_k + \sqrt{\frac{2\delta}{\mathbf{g}^\top \mathbf{F}^{-1}\mathbf{g}}}\;\mathbf{F}^{-1}\mathbf{g}, \quad \text{where } \mathbf{g} = \nabla_\theta L(\theta)\big|_{\theta_k},\quad \mathbf{F} = \nabla^2_\theta \overline{D}_{\text{KL}}\bigl(\pi_{\theta_k}\,\|\,\pi_\theta\bigr)\big|_{\theta_k}$$
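
A toy sketch of this update with an explicit Fisher matrix; real TRPO implementations approximate $\mathbf{F}^{-1}\mathbf{g}$ with conjugate gradient and Fisher-vector products rather than the dense solve used here:

```python
import numpy as np

def trpo_step(theta: np.ndarray, g: np.ndarray, F: np.ndarray,
              delta: float) -> np.ndarray:
    # theta_{k+1} = theta_k + sqrt(2*delta / (g^T F^{-1} g)) * F^{-1} g
    Finv_g = np.linalg.solve(F, g)   # solve F x = g instead of inverting F
    step_size = np.sqrt(2.0 * delta / (g @ Finv_g))
    return theta + step_size * Finv_g
```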

$$\text{ratio} = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\text{old}}(a_t \mid s_t)} = \exp\bigl(\log\pi_\theta(a_t \mid s_t) - \log\pi_{\text{old}}(a_t \mid s_t)\bigr)$$

[Figure: three models contribute log-probabilities to the loss: the Ref Model produces ref_logprobs, the Old Model produces old_logprobs, and the Current Model produces current_logprobs; the ratio is computed from old_logprobs and current_logprobs.]
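
Putting the three tensors together, a common pattern (assumed here, not confirmed by the figure) is to form the importance ratio from the old and current policies and a KL penalty against the frozen reference model:

```python
import torch

def ratio_and_kl(current_logprobs: torch.Tensor,
                 old_logprobs: torch.Tensor,
                 ref_logprobs: torch.Tensor,
                 kl_coef: float = 0.05):
    # Importance ratio between current and old policies, in log space
    ratio = torch.exp(current_logprobs - old_logprobs)
    # Per-token KL penalty against the frozen reference model
    # (the simple log pi_theta - log pi_ref estimator; others exist)
    kl_penalty = kl_coef * (current_logprobs - ref_logprobs)
    return ratio, kl_penalty
```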