Skip to content
This repository was archived by the owner on Apr 8, 2025. It is now read-only.

Commit 2fabc31

Browse files
authored
Add MultiGPU support for DPR Training via DDP (#619)
* WIP initial global sync for loss * rename vars * wip ddp * fix gathering of tensors for DDP * fix vocab_size check. fix example script for DDP. fix check of rank in PH. * fix typo. fix deprecation warning
1 parent 9a910ff commit 2fabc31

7 files changed

Lines changed: 166 additions & 24 deletions

File tree

examples/dpr_encoder.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import os
44
import pprint
55
from pathlib import Path
6+
import argparse
7+
68

79
from farm.data_handler.data_silo import DataSilo
810
from farm.data_handler.processor import TextSimilarityProcessor
@@ -15,6 +17,16 @@
1517
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings
1618
from farm.eval import Evaluator
1719

20+
def parse_arguments():
21+
parser = argparse.ArgumentParser()
22+
parser.add_argument("--local_rank",
23+
type=int,
24+
default=-1,
25+
help="local_rank for distributed training on GPUs")
26+
args = parser.parse_args()
27+
return args
28+
29+
1830
def dense_passage_retrieval():
1931
logging.basicConfig(
2032
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -29,9 +41,9 @@ def dense_passage_retrieval():
2941
########## Settings
3042
##########################
3143
set_all_seeds(seed=42)
32-
device, n_gpu = initialize_device_settings(use_cuda=True)
33-
batch_size = 2
44+
batch_size = 4
3445
n_epochs = 3
46+
distributed = False # enable for multi GPU training via DDP
3547
evaluate_every = 1000
3648
question_lang_model = "facebook/dpr-question_encoder-single-nq-base"
3749
passage_lang_model = "facebook/dpr-ctx_encoder-single-nq-base"
@@ -43,7 +55,11 @@ def dense_passage_retrieval():
4355
train_filename = "nq-train.json"
4456
dev_filename = "nq-dev.json"
4557
test_filename = "nq-dev.json"
46-
max_samples = None #load a smaller dataset (e.g. for debugging)
58+
max_samples = None # load a smaller dataset (e.g. for debugging)
59+
60+
# For multi GPU Training via DDP we need to get the local rank
61+
args = parse_arguments()
62+
device, n_gpu = initialize_device_settings(use_cuda=True, local_rank=args.local_rank)
4763

4864
# 1.Create question and passage tokenizers
4965
query_tokenizer = Tokenizer.load(pretrained_model_name_or_path=question_lang_model,
@@ -58,11 +74,11 @@ def dense_passage_retrieval():
5874
metric = "text_similarity_metric"
5975
processor = TextSimilarityProcessor(tokenizer=query_tokenizer,
6076
passage_tokenizer=passage_tokenizer,
61-
max_seq_len_query=256,
77+
max_seq_len_query=64,
6278
max_seq_len_passage=256,
6379
label_list=label_list,
6480
metric=metric,
65-
data_dir="data/retriever",
81+
data_dir="../data/retriever",
6682
train_filename=train_filename,
6783
dev_filename=dev_filename,
6884
test_filename=test_filename,
@@ -72,7 +88,7 @@ def dense_passage_retrieval():
7288

7389
# 3. Create a DataSilo that loads several datasets (train/dev/test), provides DataLoaders for them and calculates a few descriptive statistics of our datasets
7490
# NOTE: In FARM, the dev set metrics differ from test set metrics in that they are calculated on a token level instead of a word level
75-
data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=False)
91+
data_silo = DataSilo(processor=processor, batch_size=batch_size, distributed=distributed)
7692

7793

7894
# 4. Create an BiAdaptiveModel+
@@ -104,7 +120,8 @@ def dense_passage_retrieval():
104120
n_batches=len(data_silo.loaders["train"]),
105121
n_epochs=n_epochs,
106122
grad_acc_steps=1,
107-
device=device
123+
device=device,
124+
distributed=distributed
108125
)
109126

110127
# 6. Feed everything to the Trainer, which keeps care of growing our model and evaluates it from time to time

farm/data_handler/data_silo.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ def __init__(
5555
:type batch_size: int
5656
:param eval_batch_size: The size of batch that should be returned by the DataLoaders for the dev and test set.
5757
:type eval_batch_size: int
58-
:param distributed: Set to True if the program is running in a distributed setting.
58+
:param distributed: Set to True if you are running in a distributed env, e.g. using DistributedDataParallel.
59+
The DataSilo will init the DataLoader with a DistributedSampler() to distribute batches.
5960
:type distributed: bool
6061
:param automatic_loading: Set to False, if you don't want to automatically load data at initialization.
6162
:type automatic_loading: bool

farm/evaluation/metrics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def text_similarity_avg_ranks(preds, labels):
240240
241241
:param preds: list of numpy arrays of dimension n1 x n2 containing n2 predicted ranks for n1 sequences/queries
242242
:type preds: List of numpy array containing similarity scores for each sequence in batch
243-
:param labels: list of arrays of dimension n1 x n2 where each array contains n2 labels(0/1) dindicating whether the sequence/passage is a positive(1) passage or hard_negative(0) passage
243+
:param labels: list of arrays of dimension n1 x n2 where each array contains n2 labels(0/1) indicating whether the sequence/passage is a positive(1) passage or hard_negative(0) passage
244244
:type labels: List of list containing values(0/1)
245245
246246
:return: average predicted ranks of positive sequence/passage for each sample/query

farm/modeling/biadaptive_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,7 @@ def forward(self, **kwargs):
353353
:return: all logits as torch.tensor or multiple tensors.
354354
"""
355355

356-
# Run forward pass of language model
356+
# Run forward pass of both language models
357357
pooled_output = self.forward_lm(**kwargs)
358358

359359
# Run forward pass of (multiple) prediction heads using the output from above

farm/modeling/prediction_head.py

Lines changed: 62 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@
1111
import torch
1212
from torch import nn
1313
from torch.nn import CrossEntropyLoss, MSELoss, BCEWithLogitsLoss, NLLLoss
14-
14+
from torch.distributed import all_gather
1515
from farm.data_handler.utils import is_json
16-
from farm.utils import convert_iob_to_simple_tags, try_get
16+
from farm.utils import convert_iob_to_simple_tags, try_get, all_gather_list
1717
from farm.modeling.predictions import QACandidate, QAPred
1818

1919
logger = logging.getLogger(__name__)
@@ -1524,15 +1524,26 @@ class TextSimilarityHead(PredictionHead):
15241524
"""
15251525
Trains a head on predicting the similarity of two texts like in Dense Passage Retrieval.
15261526
"""
1527-
def __init__(self, similarity_function="dot_product", **kwargs):
1527+
def __init__(self, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, **kwargs):
1528+
"""
1529+
Init the TextSimilarityHead.
1530+
1531+
:param similarity_function: Function to calculate similarity between queries and passage embeddings.
1532+
Choose either "dot_product" (Default) or "cosine".
1533+
:param global_loss_buffer_size: Buffer size for all_gather() in DDP.
1534+
Increase if errors like "encoded data exceeds max_size ..." come up
1535+
1536+
:param kwargs:
1537+
"""
1538+
15281539
super(TextSimilarityHead, self).__init__()
15291540

15301541
self.similarity_function = similarity_function
15311542
self.loss_fct = NLLLoss(reduction="mean")
15321543
self.task_name = "text_similarity"
15331544
self.model_type = "text_similarity"
15341545
self.ph_output_type = "per_sequence"
1535-
1546+
self.global_loss_buffer_size = global_loss_buffer_size
15361547
self.generate_config()
15371548

15381549
@classmethod
@@ -1627,15 +1638,56 @@ def logits_to_loss(self, logits: Tuple[torch.Tensor, torch.Tensor], **kwargs):
16271638
16281639
:return: negative log likelihood loss from similarity scores
16291640
"""
1641+
1642+
# Check if DDP is initialized
1643+
try:
1644+
rank = torch.distributed.get_rank()
1645+
except AssertionError:
1646+
rank = -1
1647+
16301648
# Prepare predicted scores
16311649
query_vectors, passage_vectors = logits
1632-
softmax_scores = self._embeddings_to_scores(query_vectors, passage_vectors)
16331650

16341651
# Prepare Labels
16351652
lm_label_ids = kwargs.get(self.label_tensor_name)
16361653
positive_idx_per_question = torch.nonzero((lm_label_ids.view(-1) == 1), as_tuple=False)
1637-
#TODO gather global tensors from all nodes for DDP
1638-
global_positive_idx_per_question = positive_idx_per_question
1654+
1655+
# Gather global embeddings from all distributed nodes (DDP)
1656+
if rank != -1:
1657+
q_vector_to_send = torch.empty_like(query_vectors).cpu().copy_(query_vectors).detach_()
1658+
p_vector_to_send = torch.empty_like(passage_vectors).cpu().copy_(passage_vectors).detach_()
1659+
1660+
global_question_passage_vectors = all_gather_list(
1661+
[q_vector_to_send, p_vector_to_send, positive_idx_per_question],
1662+
max_size=self.global_loss_buffer_size)
1663+
1664+
global_query_vectors = []
1665+
global_passage_vectors = []
1666+
global_positive_idx_per_question = []
1667+
total_passages = 0
1668+
for i, item in enumerate(global_question_passage_vectors):
1669+
q_vector, p_vectors, positive_idx = item
1670+
1671+
if i != rank:
1672+
global_query_vectors.append(q_vector.to(query_vectors.device))
1673+
global_passage_vectors.append(p_vectors.to(passage_vectors.device))
1674+
global_positive_idx_per_question.extend([v + total_passages for v in positive_idx])
1675+
else:
1676+
global_query_vectors.append(query_vectors)
1677+
global_passage_vectors.append(passage_vectors)
1678+
global_positive_idx_per_question.extend([v + total_passages for v in positive_idx_per_question])
1679+
total_passages += p_vectors.size(0)
1680+
1681+
global_query_vectors = torch.cat(global_query_vectors, dim=0)
1682+
global_passage_vectors = torch.cat(global_passage_vectors, dim=0)
1683+
global_positive_idx_per_question = torch.LongTensor(global_positive_idx_per_question)
1684+
else:
1685+
global_query_vectors = query_vectors
1686+
global_passage_vectors = passage_vectors
1687+
global_positive_idx_per_question = positive_idx_per_question
1688+
1689+
# Get similarity scores
1690+
softmax_scores = self._embeddings_to_scores(global_query_vectors, global_passage_vectors)
16391691
targets = global_positive_idx_per_question.squeeze(-1).to(softmax_scores.device)
16401692

16411693
# Calculate loss
@@ -1664,7 +1716,9 @@ def prepare_labels(self, **kwargs):
16641716
"""
16651717
label_ids = kwargs.get(self.label_tensor_name)
16661718
labels = torch.zeros(label_ids.size(0), label_ids.numel())
1667-
positive_indices = (label_ids.view(-1) == 1).nonzero()
1719+
1720+
positive_indices = torch.nonzero(label_ids.view(-1) == 1, as_tuple=False)
1721+
16681722
for i, indx in enumerate(positive_indices):
16691723
labels[i, indx.item()] = 1
16701724
return labels

farm/train.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -250,9 +250,8 @@ def train(self):
250250

251251
# connect the prediction heads with the right output from processor
252252
self.model.connect_heads_with_processor(self.data_silo.processor.tasks, require_labels=True)
253-
# Check that the tokenizer fits the language model
254-
#TODO: make this compliant for DP / DDP where the model class is wrapped
255-
if self.model._get_name() == 'BiAdaptiveModel':
253+
# Check that the tokenizer(s) fits the language model(s)
254+
if hasattr(self.model, "language_model2"):
256255
self.model.verify_vocab_size(vocab_size1=len(self.data_silo.processor.tokenizer),
257256
vocab_size2=len(self.data_silo.processor.passage_tokenizer))
258257
else:
@@ -297,7 +296,6 @@ def train(self):
297296

298297
# Move batch of samples to device
299298
batch = {key: batch[key].to(self.device) for key in batch}
300-
301299
# Forward & backward pass through model
302300
logits = self.model.forward(**batch)
303301
per_sample_loss = self.model.logits_to_loss(logits=logits, global_step=self.global_step, **batch)
@@ -367,7 +365,7 @@ def train(self):
367365
self.model.connect_heads_with_processor(self.data_silo.processor.tasks, require_labels=True)
368366

369367
# Eval on test set
370-
if self.evaluator_test:
368+
if self.evaluator_test and self.local_rank in [0, -1]:
371369
test_data_loader = self.data_silo.get_data_loader("test")
372370
if test_data_loader is not None:
373371
evaluator_test = Evaluator(

farm/utils.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@
66
import signal
77
import numpy as np
88
import torch
9+
import torch.distributed as dist
910
from requests.exceptions import ConnectionError
1011
from torch import multiprocessing as mp
1112
import mlflow
1213
from copy import deepcopy
1314
import pandas as pd
1415
from tqdm import tqdm
1516
import time
16-
17+
import pickle
1718

1819
from farm.visual.ascii.images import WELCOME_BARN, WORKER_M, WORKER_F, WORKER_X
1920

@@ -475,3 +476,74 @@ def calc_duration(self, start, end):
475476
return start.elapsed_time(end) / 1000
476477
else:
477478
return end - start
479+
480+
481+
# DDP utils
482+
483+
def all_reduce(tensor, group=None):
484+
if group is None:
485+
group = dist.group.WORLD
486+
return dist.all_reduce(tensor, group=group)
487+
488+
489+
def all_gather_list(data, group=None, max_size=16384):
490+
"""Gathers arbitrary data from all nodes into a list.
491+
Similar to :func:`~torch.distributed.all_gather` but for arbitrary Python
492+
data. Note that *data* must be picklable.
493+
Args:
494+
data (Any): data from the local worker to be gathered on other workers
495+
group (optional): group of the collective
496+
"""
497+
SIZE_STORAGE_BYTES = 4 # int32 to encode the payload size
498+
499+
enc = pickle.dumps(data)
500+
enc_size = len(enc)
501+
502+
if enc_size + SIZE_STORAGE_BYTES > max_size:
503+
raise ValueError(
504+
'encoded data exceeds max_size, this can be fixed by increasing buffer size: {}'.format(enc_size))
505+
506+
rank = dist.get_rank()
507+
world_size = dist.get_world_size()
508+
buffer_size = max_size * world_size
509+
510+
if not hasattr(all_gather_list, '_buffer') or \
511+
all_gather_list._buffer.numel() < buffer_size:
512+
all_gather_list._buffer = torch.cuda.ByteTensor(buffer_size)
513+
all_gather_list._cpu_buffer = torch.ByteTensor(max_size).pin_memory()
514+
515+
buffer = all_gather_list._buffer
516+
buffer.zero_()
517+
cpu_buffer = all_gather_list._cpu_buffer
518+
519+
assert enc_size < 256 ** SIZE_STORAGE_BYTES, 'Encoded object size should be less than {} bytes'.format(
520+
256 ** SIZE_STORAGE_BYTES)
521+
522+
size_bytes = enc_size.to_bytes(SIZE_STORAGE_BYTES, byteorder='big')
523+
524+
cpu_buffer[0:SIZE_STORAGE_BYTES] = torch.ByteTensor(list(size_bytes))
525+
cpu_buffer[SIZE_STORAGE_BYTES: enc_size + SIZE_STORAGE_BYTES] = torch.ByteTensor(list(enc))
526+
527+
start = rank * max_size
528+
size = enc_size + SIZE_STORAGE_BYTES
529+
buffer[start: start + size].copy_(cpu_buffer[:size])
530+
531+
all_reduce(buffer, group=group)
532+
533+
try:
534+
result = []
535+
for i in range(world_size):
536+
out_buffer = buffer[i * max_size: (i + 1) * max_size]
537+
size = int.from_bytes(out_buffer[0:SIZE_STORAGE_BYTES], byteorder='big')
538+
if size > 0:
539+
result.append(pickle.loads(bytes(out_buffer[SIZE_STORAGE_BYTES: size + SIZE_STORAGE_BYTES].tolist())))
540+
return result
541+
except pickle.UnpicklingError:
542+
raise Exception(
543+
'Unable to unpickle data from other workers. all_gather_list requires all '
544+
'workers to enter the function together, so this error usually indicates '
545+
'that the workers have fallen out of sync somehow. Workers can fall out of '
546+
'sync if one of them runs out of memory, or if there are other conditions '
547+
'in your training script that can cause one worker to finish an epoch '
548+
'while other workers are still iterating over their portions of the data.'
549+
)

0 commit comments

Comments
 (0)