# Plain training loop: one optimizer step per batch, with a periodic
# evaluation. `model`, `training_set`, `device`, `loss_function`, `optimizer`,
# `evaluation_steps` and `evaluate_model` are expected to be defined by the
# surrounding code.
model.zero_grad()  # Reset gradient tensors before the first batch
for i, (inputs, labels) in enumerate(training_set):
    # Move the whole batch to the target device. The labels have to live on
    # the same device as the predictions, otherwise the loss computation
    # fails with a device mismatch when `device` is a GPU.
    # NOTE(review): assumes `labels` is a tensor (has `.to`) -- confirm.
    inputs = inputs.to(device)
    labels = labels.to(device)
    predictions = model(inputs)  # Forward pass
    loss = loss_function(predictions, labels)  # Compute loss function
    loss.backward()  # Backward pass
    optimizer.step()  # Now we can do an optimizer step
    model.zero_grad()  # Reset gradient tensors for the next batch
    # Evaluate the model every `evaluation_steps` batches.
    if (i + 1) % evaluation_steps == 0:
        evaluate_model()
BERT-base 모델 두 개를 이용해서 전체 구조를 학습시키고 싶은데, 모델 두 개를 사용하자니 메모리 한계로 배치 크기를 4까지밖에 올릴 수 없다…