Source code for

"""Implements the two main architectures presented in the ECIR-2023 paper."""
import warnings
from typing import Optional, Tuple

import torch
from torch import nn
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from transformers import (
    PreTrainedModel, BertModel, DPRQuestionEncoder, DPRContextEncoder,
    ViltPreTrainedModel, ViltModel, CLIPModel, CLIPConfig
)
from transformers.models.bert import BertConfig, BertPreTrainedModel

from .outputs import EncoderOutput, ECAEncoderOutput
from .image import ImageEmbedding, FaceEmbedding
from .utils import TanhGate
from .bert import BertAttention, BertEmbeddings, BertIntermediate, BertOutput, BertPooler, BertLayer

class MMConfig(BertConfig):
    """
    Shared configuration of the multimodal models, built on top of BertConfig.

    Parameters
    ----------
    *args, **kwargs:
        Forwarded untouched to BertConfig.
    n_images: int, optional
        Number of images to embed alongside with text.
        Each image can be mapped to multiple face features or image features.
        If greater than 1, each image is assigned a type embedding (analog to BERT).
        Defaults to 1.
    n_faces: int, optional
        Number of faces that the multimodal model should take as input.
        Defaults to 4.
    face_kwargs: dict, optional
        Keyword arguments used for the FaceEmbedding module.
        Defaults to dict(face_dim=512, bbox_dim=7).
    image_kwargs: dict, optional
        Keyword arguments used for as many ImageEmbedding modules (one per key).
        Defaults to {
            "clip-RN50": {"input_dim": 1024},
            "imagenet-RN50": {"input_dim": 2048}
        }
    face_and_image_are_exclusive: bool, optional
        Whether face and full-image representation should be combined (default) or exclusive.
        Handled with attention masks in transformers.
    no_text: bool, optional
        Whether to rely only on faces and images.
        In this case, only the [CLS] token embedding is concatenated to the image features.
        Defaults to False.
    gating: bool, optional
        Whether to use flamingo-style tanh gating (init at 0) [2]_
        Defaults to no gating.

    References
    ----------
    .. [2] Jean-Baptiste Alayrac et al. (2022).
       Flamingo: a Visual Language Model for Few-Shot Learning. ArXiv:2204.14198.
    """

    def __init__(
            self,
            *args,
            n_images=1,
            n_faces=4,
            face_kwargs=None,
            image_kwargs=None,
            face_and_image_are_exclusive=False,
            no_text=False,
            gating=False,
            **kwargs
    ):
        super().__init__(*args, **kwargs)

        # None is the sentinel for "use the documented default":
        # the default dicts are built here so that each instance owns its own copy.
        if face_kwargs is None:
            face_kwargs = {"face_dim": 512, "bbox_dim": 7}
        if image_kwargs is None:
            image_kwargs = {
                "clip-RN50": {"input_dim": 1024},
                "imagenet-RN50": {"input_dim": 2048},
            }

        self.n_images = n_images
        self.n_faces = n_faces
        self.face_kwargs = face_kwargs
        self.image_kwargs = image_kwargs
        self.face_and_image_are_exclusive = face_and_image_are_exclusive
        self.no_text = no_text
        self.gating = gating
class FlamantConfig(MMConfig):
    """
    Hyperparameters for multimodal cross-attention layers.
    Same defaults as BertConfig.

    Parameters
    ----------
    *args, **kwargs:
        Forwarded untouched to MMConfig.
    multimodal_attention_every: int, optional
        FlamantEncoder uses a FlamantLayer for every layer index that is a multiple
        of this value and a plain BertLayer otherwise. Defaults to 1 (every layer).
    image_num_attention_heads: int, optional
    image_intermediate_size: int, optional
    image_hidden_dropout_prob: float, optional
    image_attention_probs_dropout_prob: float, optional
        "image_" counterparts of the BertConfig parameters of the same name:
        overwrite_bert_config substitutes them for the BERT values when the
        image cross-attention of FlamantLayer is built.
    """

    def __init__(
            self,
            *args,
            multimodal_attention_every=1,
            image_num_attention_heads=12,
            image_intermediate_size=3072,
            image_hidden_dropout_prob=0.1,
            image_attention_probs_dropout_prob=0.1,
            **kwargs
    ):
        super().__init__(*args, **kwargs)
        # how often a multimodal layer replaces a standard BERT layer
        self.multimodal_attention_every = multimodal_attention_every
        # hyperparameters specific to the image cross-attention block
        self.image_num_attention_heads = image_num_attention_heads
        self.image_intermediate_size = image_intermediate_size
        self.image_hidden_dropout_prob = image_hidden_dropout_prob
        self.image_attention_probs_dropout_prob = image_attention_probs_dropout_prob
def overwrite_bert_config(flamant_config):
    """
    Build a BertConfig in which every parameter prefixed with "image_" in
    flamant_config replaces the BERT parameter of the same name without the prefix.
    See usage in FlamantLayer.

    Parameters
    ----------
    flamant_config: FlamantConfig

    Returns
    -------
    bert_config: BertConfig
    """
    prefix = "image_"
    params = flamant_config.to_dict()
    # Snapshot the matching keys first: the dict is modified while they are renamed.
    # NOTE(review): any key starting with "image_" is renamed, which presumably also
    # turns MMConfig's `image_kwargs` into a `kwargs` entry -- confirm this is intended.
    prefixed_keys = [key for key in params if key.startswith(prefix)]
    for key in prefixed_keys:
        # overwrite BERT parameter with the image version of Flamant
        params[key[len(prefix):]] = params.pop(key)
    return BertConfig.from_dict(params)
class FlamantLayer(nn.Module):
    """
    Adapted from transformers.BertLayer.

    A (optionally gated) cross-attention block over image embeddings followed by a
    feed-forward block is applied before the standard BERT layer
    (self-attention + feed-forward).

    Parameters
    ----------
    config: FlamantConfig
        `config.chunk_size_feed_forward` must be 0 (chunking is not implemented).
        `config.gating` selects TanhGate (True) or nn.Identity (False) for both gates.
    """

    def __init__(self, config):
        super().__init__()
        if config.chunk_size_feed_forward != 0:
            raise NotImplementedError()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        # cross-attention uses the "image_*" hyperparameters instead of the BERT ones
        self.image_crossattention = BertAttention(overwrite_bert_config(config), position_embedding_type="absolute")
        # like BertIntermediate + BertOutput without residual connection and layer-norm
        # which must happen after gating
        self.image_ffw = nn.Sequential(
            nn.Linear(config.hidden_size, config.image_intermediate_size),
            # FIXME: does not take into account config.hidden_act
            # (because transformers.activations.ACT2FN returns a function and not a Module)
            # Also: Squared-ReLU is used in Flamingo
            nn.GELU(),
            nn.Linear(config.image_intermediate_size, config.hidden_size),
            nn.Dropout(config.hidden_dropout_prob)
        )
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = BertAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = BertAttention(config, position_embedding_type="absolute")
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)
        if config.gating:
            self.attn_gate, self.ffw_gate = TanhGate(), TanhGate()
        else:
            self.attn_gate, self.ffw_gate = nn.Identity(), nn.Identity()

    def forward(
        self,
        hidden_states: torch.Tensor,
        image_embeddings: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        image_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        output_attentions: Optional[bool] = False
    ) -> Tuple[torch.Tensor]:
        """
        Parameters
        ----------
        hidden_states: torch.Tensor
            Text hidden states, used as query of the image cross-attention.
        image_embeddings: torch.Tensor
            Used as key and value of the image cross-attention.
        attention_mask, image_attention_mask, head_mask: optional
            Passed to the attention modules as-is.
        encoder_hidden_states, encoder_attention_mask: optional
            Unused here: they only exist so that the signature stays parallel
            to the one of a standard BERT layer.
        past_key_value, output_attentions: optional
            Not supported, see Raises.

        Returns
        -------
        outputs: tuple
            The output hidden states first, followed by whatever the self-attention
            module returns after its first item.

        Raises
        ------
        NotImplementedError
            If past_key_value is given, output_attentions is set,
            or the layer is configured as a decoder.
        """
        # Fail fast: caching, attention outputs and decoding are all unsupported
        # (the decoder case used to be detected only after the cross-attention was computed)
        if past_key_value is not None or output_attentions or self.is_decoder:
            raise NotImplementedError(
                "FlamantLayer does not support past_key_value, output_attentions nor decoder mode"
            )

        # Flamingo-style gated cross-attention
        # FIXME: BertAttention already has layer-norm and res connection
        image_attention_output = self.image_crossattention(
            hidden_states,  # query
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=image_embeddings,  # key and value
            encoder_attention_mask=image_attention_mask,
            past_key_value=None,
            output_attentions=False
        )[0]
        hidden_states = self.attn_gate(image_attention_output) + hidden_states
        hidden_states = self.ffw_gate(self.image_ffw(hidden_states)) + hidden_states
        # tough architectural choice: keep BERT-style post layer-norm
        # but it goes against the flamingo spirit of
        # "output should be the same as the pretrained language model after init"
        hidden_states = self.LayerNorm(hidden_states)

        # ========================== #
        # Below: standard BERT layer #
        # ========================== #
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=None,
        )
        attention_output = self_attention_outputs[0]
        # everything after the first item (e.g. attention weights, if any) is passed through
        extra_outputs = self_attention_outputs[1:]

        layer_output = self.feed_forward_chunk(attention_output)
        return (layer_output,) + extra_outputs

    def feed_forward_chunk(self, attention_output):
        """Standard BERT feed-forward: intermediate projection then output (fed with the residual input)."""
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
class FlamantEncoder(nn.Module):
    """
    Like BertEncoder but with FlamantLayer instead of BertLayer every n layers,
    where n is `config.multimodal_attention_every`.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList()
        for i in range(config.num_hidden_layers):
            # layer 0 is always multimodal since 0 % n == 0
            if i % config.multimodal_attention_every == 0:
                self.layer.append(FlamantLayer(config))
            else:
                self.layer.append(BertLayer(config))
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        image_embeddings: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        image_attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ):
        """
        Run all layers in sequence, feeding `image_embeddings` and
        `image_attention_mask` only to the FlamantLayer instances.

        Parameters
        ----------
        hidden_states: torch.Tensor
            Input of the first layer.
        image_embeddings: torch.Tensor
            Key/value of the multimodal cross-attention of every FlamantLayer.
        attention_mask, image_attention_mask, head_mask,
        encoder_hidden_states, encoder_attention_mask: optional
            Passed as-is to the layers. Note that the same `head_mask`
            is given to every layer.
        past_key_values: optional
            Unused.
        use_cache: bool, optional
            Not supported, see Raises.
        output_attentions, output_hidden_states: bool, optional
            Whether to collect attentions / hidden states of every layer.
        return_dict: bool, optional
            Whether to return a BaseModelOutputWithPastAndCrossAttentions (default)
            or a tuple of the outputs that are not None.

        Raises
        ------
        NotImplementedError
            If use_cache is set.
        """
        if use_cache:
            raise NotImplementedError()
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        for layer_module in self.layer:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # Inputs are positional so that they can go through torch.utils.checkpoint.
            # NOTE(review): output_attentions is part of neither tuple, so the layers run
            # with their own default; indexing layer_outputs[1] below presumably fails
            # unless the layers return attentions by default -- confirm.
            if isinstance(layer_module, FlamantLayer):
                # feed image embeddings for multimodal cross-attention
                inputs = (
                    hidden_states,
                    image_embeddings,
                    attention_mask,
                    image_attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask
                )
            else:
                # standard BERT inputs
                inputs = (
                    hidden_states,
                    attention_mask,
                    head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask
                )

            # trade compute for memory, only while training (as in transformers' BertEncoder)
            if self.gradient_checkpointing and self.training:

                def create_custom_forward(module):
                    def custom_forward(*args):
                        return module(*args)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    *inputs
                )
            else:
                layer_outputs = layer_module(*inputs)

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # also keep the output of the last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
# TODO: refactor with *PreTrainedModel abstract classes
class FlamantModel(BertPreTrainedModel):
    """
    Fuses modalities with gated cross-attention layers like in Flamingo [2]_
    Adapted from transformers.BertModel

    References
    ----------
    .. [2] Jean-Baptiste Alayrac et al. (2022).
       Flamingo: a Visual Language Model for Few-Shot Learning. ArXiv:2204.14198.
    """
    config_class = FlamantConfig
    load_tf_weights = None

    def __init__(self, config, add_pooling_layer=False):
        super().__init__(config)
        self.config = config

        self.embeddings = BertEmbeddings(config)
        self.encoder = FlamantEncoder(config)

        self.pooler = BertPooler(config) if add_pooling_layer else None

        # with several images, each one gets a type embedding and image embeddings are layer-normed
        if self.config.n_images > 1:
            self.image_type_embeddings = nn.Embedding(self.config.n_images, self.config.hidden_size)
            image_layer_norm = self.config.layer_norm_eps
        else:
            image_layer_norm = None
        if self.config.n_faces > 0:
            self.face_embedding = FaceEmbedding(embedding_dim=self.config.hidden_size,
                                                dropout=self.config.hidden_dropout_prob,
                                                layer_norm_eps=self.config.layer_norm_eps,
                                                **self.config.face_kwargs)
        else:
            self.face_embedding = None
        # image_gates is left empty here: gating happens inside the FlamantLayer instances
        self.image_embeddings, self.image_gates = nn.ModuleDict(), nn.ModuleDict()
        for name, image_kwarg in self.config.image_kwargs.items():
            self.image_embeddings[name] = ImageEmbedding(embedding_dim=self.config.hidden_size,
                                                         dropout=self.config.hidden_dropout_prob,
                                                         layer_norm_eps=image_layer_norm,
                                                         **image_kwarg)
        self.weights_to_log = {}
        # add pointers to the gate parameters so that they are logged in trainer
        if self.config.gating:
            for i, layer_module in enumerate(self.encoder.layer):
                if isinstance(layer_module, FlamantLayer):
                    self.weights_to_log[f"attn_gate_{i}"] = layer_module.attn_gate.gate_param
                    self.weights_to_log[f"ffw_gate_{i}"] = layer_module.ffw_gate.gate_param
        self.post_init()

    def get_input_embeddings(self):
        """Return the word embedding module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the word embedding module by `value`."""
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def forward(self, text_inputs, face_inputs, image_inputs,
                output_attentions=False, output_hidden_states=False, return_dict=True):
        """
        Arguments
        ---------
        text_inputs: dict[str, torch.LongTensor]
            usual BERT inputs, see transformers.BertModel
        face_inputs: dict[str, torch.FloatTensor]
            {
                "face": (batch_size, n_images, n_faces, face_dim),
                "bbox": (batch_size, n_images, n_faces, bbox_dim),
                "attention_mask": (batch_size, n_images, n_faces)
            }
        image_inputs: dict[str, dict[str, torch.FloatTensor]]
            {
                model:
                {
                    "input": (batch_size, n_images, image_dim)
                    "attention_mask": (batch_size, n_images)
                }
            }

        Returns
        -------
        ECAEncoderOutput if return_dict else tuple
            pooler_output is the hidden state of the first ([CLS]) token.

        Raises
        ------
        NotImplementedError
            If `config.no_text` is set.
        """
        # reshape faces
        faces = face_inputs['face']
        batch_size, n_images, n_faces, face_dim = faces.shape
        if n_faces > 0:
            if n_images > 1:
                image_type_ids = torch.zeros((batch_size, n_images, n_faces), dtype=torch.long, device=faces.device)
                # broadcast arange to the right shape
                image_type_ids += torch.arange(n_images, dtype=torch.long, device=faces.device).reshape(1, n_images, 1)
                image_type_embeddings = self.image_type_embeddings(image_type_ids.reshape(batch_size*n_images*n_faces))
            else:
                image_type_embeddings = None
            # embed all faces of the batch at once
            faces = faces.reshape(batch_size*n_images*n_faces, face_dim)
            bbox = face_inputs['bbox'].reshape(batch_size*n_images*n_faces, -1)
            face_output = self.face_embedding(face=faces, bbox=bbox, image_type_embeddings=image_type_embeddings)
            face_output = face_output.reshape(batch_size, n_images*n_faces, -1)
        else:
            # empty sequence so that the concatenation below still works
            face_output = torch.zeros(batch_size, 0, self.config.hidden_size, device=faces.device)
        face_attention_mask = face_inputs["attention_mask"].reshape(batch_size, n_images*n_faces)

        # embed images
        if image_inputs:
            if n_images > 1:
                image_type_ids = torch.zeros((batch_size, n_images), dtype=torch.long, device=faces.device)
                image_type_ids += torch.arange(n_images, dtype=torch.long, device=faces.device)
                image_type_embeddings = self.image_type_embeddings(image_type_ids.reshape(batch_size*n_images))
            else:
                image_type_embeddings = None
            image_outputs, image_attention_mask = [], []
            for name, image in image_inputs.items():
                image_output = self.image_embeddings[name](
                    image['input'].reshape(batch_size*n_images, -1),
                    image_type_embeddings=image_type_embeddings
                )
                image_outputs.append(image_output.reshape(batch_size, n_images, -1))
                image_attention_mask.append(image['attention_mask'])
            # (n_models, batch_size, n_images, embedding_dim) -> (batch_size, n_images*n_models, embedding_dim)
            image_outputs = torch.cat(image_outputs, dim=1)
            image_attention_mask = torch.cat(image_attention_mask, dim=1)
        else:
            image_outputs = torch.zeros(batch_size, 0, self.config.hidden_size, device=faces.device)
            image_attention_mask = torch.zeros(batch_size, 0, device=faces.device)

        if self.config.face_and_image_are_exclusive:
            # indices at the batch level: at least one face detected (i.e. not masked)
            where_are_faces = face_attention_mask.nonzero()[:, 0].unique()
            # mask images if at least one face was detected
            image_attention_mask[where_are_faces] = 0

        if self.config.no_text:
            raise NotImplementedError()

        # embed text: (batch_size, sequence_length, embedding_dim)
        token_type_ids = text_inputs.get('token_type_ids')
        text_embeddings = self.embeddings(input_ids=text_inputs['input_ids'],
                                          token_type_ids=token_type_ids)
        attention_mask = self.get_extended_attention_mask(
            text_inputs['attention_mask'], text_embeddings.shape[:-1], text_embeddings.device)

        # (batch_size, n_faces+n_models, embedding_dim)
        image_embeddings = torch.cat((face_output, image_outputs), dim=1)
        image_attention_mask = torch.cat((face_attention_mask, image_attention_mask), dim=1)
        # N. B. looks like this produces the same output as get_extended_attention_mask
        # I stick to what is in BertModel implementation
        image_attention_mask = self.invert_attention_mask(image_attention_mask)

        outputs = self.encoder(
            text_embeddings,
            image_embeddings,
            attention_mask=attention_mask,
            image_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict)

        # same as DPR: extract representation from [CLS]: the first token
        sequence_output = outputs[0]
        pooled_output = sequence_output[:, 0, :]

        if not return_dict:
            # NOTE(review): the encoder tuple starts with the last hidden states, so slicing
            # from 2 presumably drops the all-layers hidden states when they are requested
            # -- confirm whether outputs[1:] was intended.
            return (pooled_output, ) + outputs[2:]

        return ECAEncoderOutput(
            pooler_output=pooled_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)
class ViltForIR(ViltPreTrainedModel):
    """
    ViLT encoder for information retrieval.

    The multimodal representation is pooled DPR-style, from the [CLS] token,
    *not* with ViltPooler (ITM pre-trained layer), except if add_pooling_layer=True.
    """

    def __init__(self, config, add_pooling_layer=False):
        super().__init__(config)
        # N. B. post_init is called in ViltModel
        self.vilt = ViltModel(config, add_pooling_layer=add_pooling_layer)

    def forward(self, *args, return_dict=True, **kwargs):
        """
        Forward everything to ViltModel and make sure that `pooler_output` is set.

        NOTE(review): the result is accessed by attribute, so return_dict=False
        (tuple output) is presumably not supported -- confirm against callers.
        """
        vilt_outputs = self.vilt(*args, return_dict=return_dict, **kwargs)
        # ViltPooler (ITM pre-trained layer) was used: keep its pooling
        if vilt_outputs.pooler_output is not None:
            return vilt_outputs
        # default behavior: pooling from [CLS], the first token
        vilt_outputs.pooler_output = vilt_outputs.last_hidden_state[:, 0]
        return vilt_outputs
class CLIPForIR(PreTrainedModel):
    """
    Fuses image and text embeddings simply by summing them to be compatible with BiEncoder.

    Because BiEncoder uses dot-product similarity, note that this will be equivalent to computing:
        i_q*i_p + i_q*t_p + t_q*t_p + t_q*i_p

    Where i, t stand for image, text and _q and _p suffixes stand for question and passage (or context)
    i.e. computing all mono-modal and cross-modal similarities.

    But it might be worth using another trainee than BiEncoder to be able to scale these similarities.
    """
    config_class = CLIPConfig
    base_model_prefix = "clip"

    def __init__(self, config):
        super().__init__(config)
        self.clip = CLIPModel(config)
        # N. B. post_init is called in CLIPModel

    def forward(self, *args, return_dict=True, return_loss=False, **kwargs):
        """
        Parameters
        ----------
        *args, **kwargs:
            Forwarded to CLIPModel.
        return_dict: bool, optional
            Kept for interface compatibility only: the output is always an EncoderOutput.
        return_loss: bool, optional
            Forwarded to CLIPModel. Defaults to False.

        Returns
        -------
        EncoderOutput
            pooler_output is the sum of CLIP's text and image embeddings.
        """
        # Always ask CLIP for a dict-like output: the embeddings are read by attribute below,
        # which used to crash (tuple output) when the caller passed return_dict=False.
        clip_outputs = self.clip(*args, return_dict=True, return_loss=return_loss, **kwargs)
        multimodal_output = clip_outputs.text_embeds + clip_outputs.image_embeds
        return EncoderOutput(pooler_output=multimodal_output)
class ECAEncoder(PreTrainedModel):
    """
    Text and image are fused by concatenating them at the sequence-level then feeding them to BERT, à la UNITER [1]_
        - one face ≃ one token
        - one image ≃ one token

    The multimodal representation is obtained from the "[CLS]" token.

    When using gating (see MMConfig), it is done before the attention layer, unlike in Flamingo [2]_

    Parameters
    ----------
    config: MMConfig
    init_weights_like_bert: bool, optional
        Whether `_init_weights` should initialize linear, embedding and layer-norm modules
        like BertPreTrainedModel (True) or only embeddings, keeping torch defaults
        for the rest (False, default).

    References
    ----------
    .. [1] Chen, Y.C., Li, L., Yu, L., El Kholy, A., Ahmed, F., Gan, Z., Cheng, Y., Liu, J.:
       Uniter: Universal image-text representation learning. In: European Conference on
       Computer Vision. pp. 104–120. Springer (2020)
    .. [2] Jean-Baptiste Alayrac et al. (2022).
       Flamingo: a Visual Language Model for Few-Shot Learning. ArXiv:2204.14198.
    """
    config_class = MMConfig
    load_tf_weights = None
    base_model_prefix = "bert_model"

    def __init__(self, config, init_weights_like_bert=False):
        # the initialization scheme is selected before calling the parent constructor
        if init_weights_like_bert:
            self._init_weights = self._init_weights_like_bert
        else:
            self._init_weights = self._init_weights_like_ict
        super().__init__(config)
        self.config = config
        self.bert_model = BertModel(config, add_pooling_layer=False)
        # add pointers to the gate parameters so that they are logged in trainer
        self.weights_to_log = {}
        # with several images, each one gets a type embedding and image embeddings are layer-normed
        if self.config.n_images > 1:
            self.image_type_embeddings = nn.Embedding(self.config.n_images, self.config.hidden_size)
            image_layer_norm = self.config.layer_norm_eps
        else:
            image_layer_norm = None
        if self.config.n_faces > 0:
            self.face_embedding = FaceEmbedding(embedding_dim=self.config.hidden_size,
                                                dropout=self.config.hidden_dropout_prob,
                                                layer_norm_eps=self.config.layer_norm_eps,
                                                **self.config.face_kwargs)
            if self.config.gating:
                self.face_gate = TanhGate()
                self.weights_to_log["face_gate"] = self.face_gate.gate_param
            else:
                self.face_gate = nn.Identity()
        else:
            self.face_embedding = None
        self.image_embeddings, self.image_gates = nn.ModuleDict(), nn.ModuleDict()
        for name, image_kwarg in self.config.image_kwargs.items():
            self.image_embeddings[name] = ImageEmbedding(embedding_dim=self.config.hidden_size,
                                                         dropout=self.config.hidden_dropout_prob,
                                                         layer_norm_eps=image_layer_norm,
                                                         **image_kwarg)
            if self.config.gating:
                self.image_gates[name] = TanhGate()
                self.weights_to_log[f"{name}_gate"] = self.image_gates[name].gate_param
            else:
                self.image_gates[name] = nn.Identity()

    def _init_weights_like_ict(self, module):
        """Initialize embeddings like BERT and leave every other module untouched."""
        # same as BERT
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        # keep torch defaults for linear layers

    def _init_weights_like_bert(self, module):
        """Initialize linear, embedding and layer-norm modules like BertPreTrainedModel."""
        # taken from BertPreTrainedModel
        if isinstance(module, nn.Linear):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, text_inputs, face_inputs, image_inputs,
                output_attentions=False, output_hidden_states=False, return_dict=True):
        """
        Arguments
        ---------
        text_inputs: dict[str, torch.LongTensor]
            usual BERT inputs, see transformers.BertModel
            (left unmodified, even when `config.no_text` is set)
        face_inputs: dict[str, torch.FloatTensor]
            {
                "face": (batch_size, n_images, n_faces, face_dim),
                "bbox": (batch_size, n_images, n_faces, bbox_dim),
                "attention_mask": (batch_size, n_images, n_faces)
            }
        image_inputs: dict[str, dict[str, torch.FloatTensor]]
            {
                model:
                {
                    "input": (batch_size, n_images, image_dim)
                    "attention_mask": (batch_size, n_images)
                }
            }

        Returns
        -------
        ECAEncoderOutput if return_dict else tuple
            pooler_output is the hidden state of the first ([CLS]) token.
        """
        # reshape faces
        faces = face_inputs['face']
        batch_size, n_images, n_faces, face_dim = faces.shape
        assert n_images == self.config.n_images
        if n_faces > 0:
            if n_images > 1:
                image_type_ids = torch.zeros((batch_size, n_images, n_faces), dtype=torch.long, device=faces.device)
                # broadcast arange to the right shape
                image_type_ids += torch.arange(n_images, dtype=torch.long, device=faces.device).reshape(1, n_images, 1)
                image_type_embeddings = self.image_type_embeddings(image_type_ids.reshape(batch_size*n_images*n_faces))
            else:
                image_type_embeddings = None
            # embed all faces of the batch at once
            faces = faces.reshape(batch_size*n_images*n_faces, face_dim)
            bbox = face_inputs['bbox'].reshape(batch_size*n_images*n_faces, -1)
            face_output = self.face_embedding(face=faces, bbox=bbox, image_type_embeddings=image_type_embeddings)
            face_output = face_output.reshape(batch_size, n_images*n_faces, -1)
            # maybe gate faces
            face_output = self.face_gate(face_output)
        else:
            # empty sequence so that the concatenation below still works
            face_output = torch.zeros(batch_size, 0, self.config.hidden_size, device=faces.device)
        face_attention_mask = face_inputs["attention_mask"].reshape(batch_size, n_images*n_faces)

        # embed images
        if image_inputs:
            if n_images > 1:
                image_type_ids = torch.zeros((batch_size, n_images), dtype=torch.long, device=faces.device)
                image_type_ids += torch.arange(n_images, dtype=torch.long, device=faces.device)
                image_type_embeddings = self.image_type_embeddings(image_type_ids.reshape(batch_size*n_images))
            else:
                image_type_embeddings = None
            image_outputs, image_attention_mask = [], []
            for name, image in image_inputs.items():
                image_output = self.image_embeddings[name](
                    image['input'].reshape(batch_size*n_images, -1),
                    image_type_embeddings=image_type_embeddings
                )
                # maybe gate image
                image_output = self.image_gates[name](image_output)
                image_outputs.append(image_output.reshape(batch_size, n_images, -1))
                image_attention_mask.append(image['attention_mask'])
            # (n_models, batch_size, n_images, embedding_dim) -> (batch_size, n_images*n_models, embedding_dim)
            image_outputs = torch.cat(image_outputs, dim=1)
            image_attention_mask = torch.cat(image_attention_mask, dim=1)
        else:
            image_outputs = torch.zeros(batch_size, 0, self.config.hidden_size, device=faces.device)
            image_attention_mask = torch.zeros(batch_size, 0, device=faces.device)

        if self.config.face_and_image_are_exclusive:
            # indices at the batch level: at least one face detected (i.e. not masked)
            where_are_faces = face_attention_mask.nonzero()[:, 0].unique()
            # mask images if at least one face was detected
            image_attention_mask[where_are_faces] = 0

        # work on local references so that the caller's text_inputs dict is never modified
        input_ids = text_inputs['input_ids']
        text_attention_mask = text_inputs['attention_mask']
        token_type_ids = text_inputs.get('token_type_ids')
        # keep only the [CLS] token
        if self.config.no_text:
            input_ids = input_ids[:, :1]
            text_attention_mask = text_attention_mask[:, :1]
            if token_type_ids is not None:
                token_type_ids = token_type_ids[:, :1]

        # embed text: (batch_size, sequence_length, embedding_dim)
        text_embeddings = self.bert_model.embeddings(input_ids=input_ids,
                                                     token_type_ids=token_type_ids)

        # (batch_size, sequence_length+(n_faces+n_models)*n_images, embedding_dim)
        multimodal_embeddings = torch.cat((text_embeddings, face_output, image_outputs), dim=1)
        attention_mask = torch.cat((text_attention_mask, face_attention_mask, image_attention_mask), dim=1)
        extended_attention_mask = self.bert_model.get_extended_attention_mask(
            attention_mask, multimodal_embeddings.shape[:-1], multimodal_embeddings.device
        )
        outputs = self.bert_model.encoder(multimodal_embeddings, attention_mask=extended_attention_mask,
                                          output_attentions=output_attentions,
                                          output_hidden_states=output_hidden_states,
                                          return_dict=return_dict)

        # same as DPR: extract representation from [CLS]: the first token
        sequence_output = outputs[0]
        pooled_output = sequence_output[:, 0, :]

        if not return_dict:
            # NOTE(review): the encoder tuple starts with the last hidden states, so slicing
            # from 2 presumably drops the all-layers hidden states when they are requested
            # -- confirm whether outputs[1:] was intended.
            return (pooled_output, sequence_output) + outputs[2:]

        return ECAEncoderOutput(
            pooler_output=pooled_output,
            last_hidden_state=sequence_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)
class ILFConfig(MMConfig):
    """
    Configuration of IntermediateLinearFusion: MMConfig plus one extra parameter.

    Parameters
    ----------
    *args, **kwargs:
        Forwarded untouched to MMConfig.
    question_encoder: bool, optional
        Whether to use DPRQuestionEncoder (default) or DPRContextEncoder.
        This makes no real differences in the architecture, only the name changes.
    """

    def __init__(self, *args, question_encoder=True, **kwargs):
        super().__init__(*args, **kwargs)
        # True: DPRQuestionEncoder, False: DPRContextEncoder
        self.question_encoder = question_encoder
class IntermediateLinearFusion(PreTrainedModel):
    """Fuses DPR’s text representation with image embeddings by projecting them linearly in the same space"""
    config_class = ILFConfig
    load_tf_weights = None
    base_model_prefix = "dpr_encoder"

    def __init__(self, config):
        # only a single image per example is supported
        if config.n_images > 1:
            raise NotImplementedError()
        super().__init__(config)
        self.config = config
        if self.config.question_encoder:
            self.dpr_encoder = DPRQuestionEncoder(config)
        else:
            self.dpr_encoder = DPRContextEncoder(config)
        if self.config.n_faces > 0:
            self.face_embedding = FaceEmbedding(embedding_dim=self.config.hidden_size,
                                                dropout=self.config.hidden_dropout_prob,
                                                layer_norm_eps=self.config.layer_norm_eps,
                                                **self.config.face_kwargs)
        else:
            self.face_embedding = None
        self.image_embeddings = nn.ModuleDict()
        for name, image_kwarg in self.config.image_kwargs.items():
            self.image_embeddings[name] = ImageEmbedding(embedding_dim=self.config.hidden_size,
                                                         dropout=self.config.hidden_dropout_prob,
                                                         **image_kwarg)
        self.dpr_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.LayerNorm = nn.LayerNorm(self.config.hidden_size, eps=self.config.layer_norm_eps)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)

    def _init_weights(self, module):
        """Initialize embeddings like BERT and leave every other module untouched."""
        # same as BERT
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        # keep torch defaults for linear layers

    def forward(self, text_inputs, face_inputs, image_inputs):
        """
        Arguments
        ---------
        text_inputs: dict[str, torch.LongTensor]
            usual BERT inputs, see transformers.DPRQuestionEncoder
        face_inputs: dict[str, torch.FloatTensor]
            {
                "face": (batch_size, n_images, n_faces, face_dim), with n_images <= 1
                "bbox": reshaped to (batch_size*n_faces, bbox_dim),
                "attention_mask": only read if config.face_and_image_are_exclusive,
                                  its first dimension must be the batch
            }
        image_inputs: dict[str, dict[str, torch.FloatTensor]]
            {
                model:
                {
                    "input": reshaped to (batch_size, image_dim)
                }
            }
            The input tensors are left unmodified.

        Returns
        -------
        EncoderOutput
            pooler_output is the layer-normed sum of the projected text representation,
            the face embeddings and the image embeddings.

        Raises
        ------
        NotImplementedError
            If there is more than one image per example.
        """
        # embed text
        output = self.dpr_encoder(**text_inputs).pooler_output
        output = self.dpr_proj(output)

        # reshape faces
        faces = face_inputs['face']
        batch_size, n_images, n_faces, face_dim = faces.shape
        if n_images > 1:
            raise NotImplementedError()
        if n_faces > 0:
            faces = faces.reshape(batch_size * n_faces, face_dim)
            # embed batch of size batch_size*n_faces
            face_output = self.face_embedding(face=faces, bbox=face_inputs['bbox'].reshape(batch_size * n_faces, -1))
            face_output = face_output.reshape(batch_size, n_faces, -1)
            # sum over all faces
            face_output = face_output.sum(axis=1)

            # fuse text and faces
            output += face_output

        # fuse text and image
        exclusive = self.config.face_and_image_are_exclusive
        if exclusive:
            face_attention_mask = face_inputs["attention_mask"]
            # indices at the batch level: at least one face detected (i.e. not masked)
            where_are_faces = face_attention_mask.nonzero()[:, 0].unique()
        for name, image in image_inputs.items():
            image_input = image['input']
            # mask images if at least one face was detected
            if exclusive:
                # zero a copy: writing into image['input'] would silently alter the caller's tensor
                image_input = image_input.clone()
                image_input[where_are_faces] = 0
            output += self.image_embeddings[name](image_input.reshape(batch_size, -1))

        output = self.LayerNorm(output)
        output = self.dropout(output)
        return EncoderOutput(pooler_output=output)