modeling script
- .gitignore +15 -0
- modeling.py +66 -0
- ultra/__init__.py +0 -0
- ultra/base_nbfnet.py +336 -0
- ultra/datasets.py +1095 -0
- ultra/eval.py +153 -0
- ultra/layers.py +234 -0
- ultra/models.py +214 -0
- ultra/rspmm/rspmm.py +204 -0
- ultra/rspmm/source/operator.cuh +82 -0
- ultra/rspmm/source/rspmm.cpp +283 -0
- ultra/rspmm/source/rspmm.cu +386 -0
- ultra/rspmm/source/rspmm.h +108 -0
- ultra/rspmm/source/util.cuh +28 -0
- ultra/tasks.py +201 -0
- ultra/util.py +172 -0
.gitignore
ADDED
@@ -0,0 +1,15 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+output/
+.vscode/
+.DS_Store
+datasets/
+ckpts/
+*.csv
+*.txt
modeling.py
ADDED
@@ -0,0 +1,66 @@
+import os
+import sys
+from transformers import PretrainedConfig, PreTrainedModel
+#sys.path.append(os.path.dirname(os.path.dirname(__file__)))
+from ultra.models import Ultra
+from ultra.datasets import WN18RR, CoDExSmall, FB15k237, FB15k237Inductive
+from ultra.eval import test
+
+
+class UltraConfig(PretrainedConfig):
+
+    model_type = "ultra"
+
+    def __init__(
+        self,
+        relation_model_layers: int = 6,
+        relation_model_dim: int = 64,
+        entity_model_layers: int = 6,
+        entity_model_dim: int = 64,
+        **kwargs):
+
+        self.relation_model_cfg = dict(
+            input_dim=relation_model_dim,
+            hidden_dims=[relation_model_dim]*relation_model_layers,
+            message_func="distmult",
+            aggregate_func="sum",
+            short_cut=True,
+            layer_norm=True
+        )
+
+        self.entity_model_cfg = dict(
+            input_dim=entity_model_dim,
+            hidden_dims=[entity_model_dim]*entity_model_layers,
+            message_func="distmult",
+            aggregate_func="sum",
+            short_cut=True,
+            layer_norm=True
+        )
+
+        super().__init__(**kwargs)
+
+class UltraLinkPrediction(PreTrainedModel):
+
+    config_class = UltraConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.model = Ultra(
+            rel_model_cfg=config.relation_model_cfg,
+            entity_model_cfg=config.entity_model_cfg,
+        )
+
+    def forward(self, data, batch):
+        # data: PyG data object
+        # batch shape: (bs, 1+num_negs, 3)
+        return self.model.forward(data, batch)
+
+
+if __name__ == "__main__":
+
+    model = UltraLinkPrediction.from_pretrained("mgalkin/ultra_50g")
+    dataset = CoDExSmall(root="./datasets/")
+    test(model, mode="test", dataset=dataset, gpus=None)
+    # mrr: 0.497697
+    # hits@10: 0.685175
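Usage note (not part of the committed files): a minimal sketch of driving the wrapper above, assuming the repo root is on PYTHONPATH and torch, torch_geometric, and torch_scatter are installed. The batch built here is purely illustrative — one positive triple per row plus random tail corruptions in the (bs, 1+num_negs, 3) layout documented in forward — not the filtered-ranking protocol of ultra.eval.test.

# Minimal usage sketch (hypothetical batch construction, not from this commit).
import torch
from modeling import UltraLinkPrediction
from ultra.datasets import CoDExSmall

model = UltraLinkPrediction.from_pretrained("mgalkin/ultra_50g").eval()
dataset = CoDExSmall(root="./datasets/")
data = dataset[2]  # splits are collated as [train, valid, test]

# one positive (h, t, r) triple per row, followed by 5 random tail corruptions
h, t = data.target_edge_index[:, :4]
r = data.target_edge_type[:4]
pos = torch.stack([h, t, r], dim=-1)                  # (4, 3)
neg = pos.unsqueeze(1).repeat(1, 5, 1)                # (4, 5, 3)
neg[:, :, 1] = torch.randint(data.num_nodes, (4, 5))  # corrupt the tail column
batch = torch.cat([pos.unsqueeze(1), neg], dim=1)     # (4, 1 + 5, 3)

with torch.no_grad():
    scores = model(data, batch)                       # (4, 6) logits, positives first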
ultra/__init__.py
ADDED
File without changes
ultra/base_nbfnet.py
ADDED
@@ -0,0 +1,336 @@
+import copy
+from collections.abc import Sequence
+
+import torch
+from torch import nn, autograd
+
+from torch_scatter import scatter_add
+from . import tasks, layers
+
+
+class BaseNBFNet(nn.Module):
+
+    def __init__(self, input_dim, hidden_dims, num_relation, message_func="distmult", aggregate_func="sum",
+                 short_cut=False, layer_norm=False, activation="relu", concat_hidden=False, num_mlp_layer=2,
+                 dependent=False, remove_one_hop=False, num_beam=10, path_topk=10, **kwargs):
+        super(BaseNBFNet, self).__init__()
+
+        if not isinstance(hidden_dims, Sequence):
+            hidden_dims = [hidden_dims]
+
+        self.dims = [input_dim] + list(hidden_dims)
+        self.num_relation = num_relation
+        self.short_cut = short_cut  # whether to use residual connections between GNN layers
+        self.concat_hidden = concat_hidden  # whether to compute final states as a function of all layer outputs or last
+        self.remove_one_hop = remove_one_hop  # whether to dynamically remove one-hop edges from edge_index
+        self.num_beam = num_beam
+        self.path_topk = path_topk
+
+        self.message_func = message_func
+        self.aggregate_func = aggregate_func
+        self.layer_norm = layer_norm
+        self.activation = activation
+        self.num_mlp_layers = num_mlp_layer
+
+        # self.layers = nn.ModuleList()
+        # for i in range(len(self.dims) - 1):
+        #     self.layers.append(layers.GeneralizedRelationalConv(self.dims[i], self.dims[i + 1], num_relation,
+        #                                                         self.dims[0], message_func, aggregate_func, layer_norm,
+        #                                                         activation, dependent))
+
+        # feature_dim = (sum(hidden_dims) if concat_hidden else hidden_dims[-1]) + input_dim
+
+        # # additional relation embedding which serves as an initial 'query' for the NBFNet forward pass
+        # # each layer has its own learnable relations matrix, so we send the total number of relations, too
+        # self.query = nn.Embedding(num_relation, input_dim)
+        # self.mlp = nn.Sequential()
+        # mlp = []
+        # for i in range(num_mlp_layer - 1):
+        #     mlp.append(nn.Linear(feature_dim, feature_dim))
+        #     mlp.append(nn.ReLU())
+        # mlp.append(nn.Linear(feature_dim, 1))
+        # self.mlp = nn.Sequential(*mlp)
+
+    def remove_easy_edges(self, data, h_index, t_index, r_index=None):
+        # we remove training edges (we need to predict them at training time) from the edge index
+        # think of it as a dynamic edge dropout
+        h_index_ext = torch.cat([h_index, t_index], dim=-1)
+        t_index_ext = torch.cat([t_index, h_index], dim=-1)
+        r_index_ext = torch.cat([r_index, r_index + data.num_relations // 2], dim=-1)
+        if self.remove_one_hop:
+            # we remove all existing immediate edges between heads and tails in the batch
+            edge_index = data.edge_index
+            easy_edge = torch.stack([h_index_ext, t_index_ext]).flatten(1)
+            index = tasks.edge_match(edge_index, easy_edge)[0]
+            mask = ~index_to_mask(index, data.num_edges)
+        else:
+            # we remove existing immediate edges between heads and tails in the batch with the given relation
+            edge_index = torch.cat([data.edge_index, data.edge_type.unsqueeze(0)])
+            # note that here we add relation types r_index_ext to the matching query
+            easy_edge = torch.stack([h_index_ext, t_index_ext, r_index_ext]).flatten(1)
+            index = tasks.edge_match(edge_index, easy_edge)[0]
+            mask = ~index_to_mask(index, data.num_edges)
+
+        data = copy.copy(data)
+        data.edge_index = data.edge_index[:, mask]
+        data.edge_type = data.edge_type[mask]
+        return data
+
+    def negative_sample_to_tail(self, h_index, t_index, r_index, num_direct_rel):
+        # convert p(h | t, r) to p(t' | h', r')
+        # h' = t, r' = r^{-1}, t' = h
+        is_t_neg = (h_index == h_index[:, [0]]).all(dim=-1, keepdim=True)
+        new_h_index = torch.where(is_t_neg, h_index, t_index)
+        new_t_index = torch.where(is_t_neg, t_index, h_index)
+        new_r_index = torch.where(is_t_neg, r_index, r_index + num_direct_rel)
+        return new_h_index, new_t_index, new_r_index
+
+    def bellmanford(self, data, h_index, r_index, separate_grad=False):
+        batch_size = len(r_index)
+
+        # initialize queries (relation types of the given triples)
+        query = self.query(r_index)
+        index = h_index.unsqueeze(-1).expand_as(query)
+
+        # initial (boundary) condition - initialize all node states as zeros
+        boundary = torch.zeros(batch_size, data.num_nodes, self.dims[0], device=h_index.device)
+        # by the scatter operation we put query (relation) embeddings as init features of source (index) nodes
+        boundary.scatter_add_(1, index.unsqueeze(1), query.unsqueeze(1))
+        size = (data.num_nodes, data.num_nodes)
+        edge_weight = torch.ones(data.num_edges, device=h_index.device)
+
+        hiddens = []
+        edge_weights = []
+        layer_input = boundary
+
+        for layer in self.layers:
+            if separate_grad:
+                edge_weight = edge_weight.clone().requires_grad_()
+            # Bellman-Ford iteration, we send the original boundary condition in addition to the updated node states
+            hidden = layer(layer_input, query, boundary, data.edge_index, data.edge_type, size, edge_weight)
+            if self.short_cut and hidden.shape == layer_input.shape:
+                # residual connection here
+                hidden = hidden + layer_input
+            hiddens.append(hidden)
+            edge_weights.append(edge_weight)
+            layer_input = hidden
+
+        # original query (relation type) embeddings
+        node_query = query.unsqueeze(1).expand(-1, data.num_nodes, -1)  # (batch_size, num_nodes, input_dim)
+        if self.concat_hidden:
+            output = torch.cat(hiddens + [node_query], dim=-1)
+        else:
+            output = torch.cat([hiddens[-1], node_query], dim=-1)
+
+        return {
+            "node_feature": output,
+            "edge_weights": edge_weights,
+        }
+
+    def forward(self, data, batch):
+        h_index, t_index, r_index = batch.unbind(-1)
+        if self.training:
+            # Edge dropout in the training mode
+            # here we want to remove immediate edges (head, relation, tail) from the edge_index and edge_types
+            # to make NBFNet iteration learn non-trivial paths
+            data = self.remove_easy_edges(data, h_index, t_index, r_index)
+
+        shape = h_index.shape
+        # turn all triples in a batch into a tail prediction mode
+        h_index, t_index, r_index = self.negative_sample_to_tail(h_index, t_index, r_index, num_direct_rel=data.num_relations // 2)
+        assert (h_index[:, [0]] == h_index).all()
+        assert (r_index[:, [0]] == r_index).all()
+
+        # message passing and updated node representations
+        output = self.bellmanford(data, h_index[:, 0], r_index[:, 0])  # (batch_size, num_nodes, feature_dim)
+        feature = output["node_feature"]
+        index = t_index.unsqueeze(-1).expand(-1, -1, feature.shape[-1])
+        # extract representations of tail entities from the updated node states
+        feature = feature.gather(1, index)  # (batch_size, num_negative + 1, feature_dim)
+
+        # probability logit for each tail node in the batch
+        # (batch_size, num_negative + 1, dim) -> (batch_size, num_negative + 1)
+        score = self.mlp(feature).squeeze(-1)
+        return score.view(shape)
+
+    def visualize(self, data, batch):
+        assert batch.shape == (1, 3)
+        h_index, t_index, r_index = batch.unbind(-1)
+
+        output = self.bellmanford(data, h_index, r_index, separate_grad=True)
+        feature = output["node_feature"]
+        edge_weights = output["edge_weights"]
+
+        index = t_index.unsqueeze(0).unsqueeze(-1).expand(-1, -1, feature.shape[-1])
+        feature = feature.gather(1, index).squeeze(0)
+        score = self.mlp(feature).squeeze(-1)
+
+        edge_grads = autograd.grad(score, edge_weights)
+        distances, back_edges = self.beam_search_distance(data, edge_grads, h_index, t_index, self.num_beam)
+        paths, weights = self.topk_average_length(distances, back_edges, t_index, self.path_topk)
+
+        return paths, weights
+
+    @torch.no_grad()
+    def beam_search_distance(self, data, edge_grads, h_index, t_index, num_beam=10):
+        # beam search the top-k distance from h to t (and to every other node)
+        num_nodes = data.num_nodes
+        input = torch.full((num_nodes, num_beam), float("-inf"), device=h_index.device)
+        input[h_index, 0] = 0
+        edge_mask = data.edge_index[0, :] != t_index
+
+        distances = []
+        back_edges = []
+        for edge_grad in edge_grads:
+            # we don't allow any path to leave t once it arrives at t
+            node_in, node_out = data.edge_index[:, edge_mask]
+            relation = data.edge_type[edge_mask]
+            edge_grad = edge_grad[edge_mask]
+
+            message = input[node_in] + edge_grad.unsqueeze(-1)  # (num_edges, num_beam)
+            # (num_edges, num_beam, 3)
+            msg_source = torch.stack([node_in, node_out, relation], dim=-1).unsqueeze(1).expand(-1, num_beam, -1)
+
+            # (num_edges, num_beam)
+            is_duplicate = torch.isclose(message.unsqueeze(-1), message.unsqueeze(-2)) & \
+                           (msg_source.unsqueeze(-2) == msg_source.unsqueeze(-3)).all(dim=-1)
+            # pick the first occurrence as the ranking in the previous node's beam
+            # this makes deduplication easier later
+            # and store it in msg_source
+            is_duplicate = is_duplicate.float() - \
+                           torch.arange(num_beam, dtype=torch.float, device=message.device) / (num_beam + 1)
+            prev_rank = is_duplicate.argmax(dim=-1, keepdim=True)
+            msg_source = torch.cat([msg_source, prev_rank], dim=-1)  # (num_edges, num_beam, 4)
+
+            node_out, order = node_out.sort()
+            node_out_set = torch.unique(node_out)
+            # sort messages w.r.t. node_out
+            message = message[order].flatten()  # (num_edges * num_beam)
+            msg_source = msg_source[order].flatten(0, -2)  # (num_edges * num_beam, 4)
+            size = node_out.bincount(minlength=num_nodes)
+            msg2out = size_to_index(size[node_out_set] * num_beam)
+            # deduplicate messages that are from the same source and the same beam
+            is_duplicate = (msg_source[1:] == msg_source[:-1]).all(dim=-1)
+            is_duplicate = torch.cat([torch.zeros(1, dtype=torch.bool, device=message.device), is_duplicate])
+            message = message[~is_duplicate]
+            msg_source = msg_source[~is_duplicate]
+            msg2out = msg2out[~is_duplicate]
+            size = msg2out.bincount(minlength=len(node_out_set))
+
+            if not torch.isinf(message).all():
+                # take the topk messages from the neighborhood
+                # distance: (len(node_out_set) * num_beam)
+                distance, rel_index = scatter_topk(message, size, k=num_beam)
+                abs_index = rel_index + (size.cumsum(0) - size).unsqueeze(-1)
+                # store msg_source for backtracking
+                back_edge = msg_source[abs_index]  # (len(node_out_set) * num_beam, 4)
+                distance = distance.view(len(node_out_set), num_beam)
+                back_edge = back_edge.view(len(node_out_set), num_beam, 4)
+                # scatter distance / back_edge back to all nodes
+                distance = scatter_add(distance, node_out_set, dim=0, dim_size=num_nodes)  # (num_nodes, num_beam)
+                back_edge = scatter_add(back_edge, node_out_set, dim=0, dim_size=num_nodes)  # (num_nodes, num_beam, 4)
+            else:
+                distance = torch.full((num_nodes, num_beam), float("-inf"), device=message.device)
+                back_edge = torch.zeros(num_nodes, num_beam, 4, dtype=torch.long, device=message.device)
+
+            distances.append(distance)
+            back_edges.append(back_edge)
+            input = distance
+
+        return distances, back_edges
+
+    def topk_average_length(self, distances, back_edges, t_index, k=10):
+        # backtrack distances and back_edges to generate the paths
+        paths = []
+        average_lengths = []
+
+        for i in range(len(distances)):
+            distance, order = distances[i][t_index].flatten(0, -1).sort(descending=True)
+            back_edge = back_edges[i][t_index].flatten(0, -2)[order]
+            for d, (h, t, r, prev_rank) in zip(distance[:k].tolist(), back_edge[:k].tolist()):
+                if d == float("-inf"):
+                    break
+                path = [(h, t, r)]
+                for j in range(i - 1, -1, -1):
+                    h, t, r, prev_rank = back_edges[j][h, prev_rank].tolist()
+                    path.append((h, t, r))
+                paths.append(path[::-1])
+                average_lengths.append(d / len(path))
+
+        if paths:
+            average_lengths, paths = zip(*sorted(zip(average_lengths, paths), reverse=True)[:k])
+
+        return paths, average_lengths
+
+
+def index_to_mask(index, size):
+    index = index.view(-1)
+    size = int(index.max()) + 1 if size is None else size
+    mask = index.new_zeros(size, dtype=torch.bool)
+    mask[index] = True
+    return mask
+
+
+def size_to_index(size):
+    range = torch.arange(len(size), device=size.device)
+    index2sample = range.repeat_interleave(size)
+    return index2sample
+
+
+def multi_slice_mask(starts, ends, length):
+    values = torch.cat([torch.ones_like(starts), -torch.ones_like(ends)])
+    slices = torch.cat([starts, ends])
+    mask = scatter_add(values, slices, dim=0, dim_size=length + 1)[:-1]
+    mask = mask.cumsum(0).bool()
+    return mask
+
+
+def scatter_extend(data, size, input, input_size):
+    new_size = size + input_size
+    new_cum_size = new_size.cumsum(0)
+    new_data = torch.zeros(new_cum_size[-1], *data.shape[1:], dtype=data.dtype, device=data.device)
+    starts = new_cum_size - new_size
+    ends = starts + size
+    index = multi_slice_mask(starts, ends, new_cum_size[-1])
+    new_data[index] = data
+    new_data[~index] = input
+    return new_data, new_size
+
+
+def scatter_topk(input, size, k, largest=True):
+    index2graph = size_to_index(size)
+    index2graph = index2graph.view([-1] + [1] * (input.ndim - 1))
+
+    mask = ~torch.isinf(input)
+    max = input[mask].max().item()
+    min = input[mask].min().item()
+    safe_input = input.clamp(2 * min - max, 2 * max - min)
+    offset = (max - min) * 4
+    if largest:
+        offset = -offset
+    input_ext = safe_input + offset * index2graph
+    index_ext = input_ext.argsort(dim=0, descending=largest)
+    num_actual = size.clamp(max=k)
+    num_padding = k - num_actual
+    starts = size.cumsum(0) - size
+    ends = starts + num_actual
+    mask = multi_slice_mask(starts, ends, len(index_ext)).nonzero().flatten()
+
+    if (num_padding > 0).any():
+        # special case: size < k, pad with the last valid index
+        padding = ends - 1
+        padding2graph = size_to_index(num_padding)
+        mask = scatter_extend(mask, num_actual, padding[padding2graph], num_padding)[0]
+
+    index = index_ext[mask]  # (N * k, ...)
+    value = input.gather(0, index)
+    if isinstance(k, torch.Tensor) and k.shape == size.shape:
+        value = value.view(-1, *input.shape[1:])
+        index = index.view(-1, *input.shape[1:])
+        index = index - (size.cumsum(0) - size).repeat_interleave(k).view([-1] + [1] * (index.ndim - 1))
+    else:
+        value = value.view(-1, k, *input.shape[1:])
+        index = index.view(-1, k, *input.shape[1:])
+        index = index - (size.cumsum(0) - size).view([-1] + [1] * (index.ndim - 1))
+
+    return value, index
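For reference, the module-level helpers at the bottom of this file (index_to_mask, size_to_index, multi_slice_mask) do the segment bookkeeping that the beam search relies on. A small sanity-check sketch of their behavior (not part of the commit; assumes the repo root is on PYTHONPATH and torch_scatter is installed):

# Illustrative check of the segment helpers above (assumption: repo importable).
import torch
from ultra.base_nbfnet import index_to_mask, size_to_index, multi_slice_mask

# index_to_mask: boolean mask of length `size` with True at the given positions
print(index_to_mask(torch.tensor([0, 2, 3]), size=5))
# tensor([ True, False,  True,  True, False])

# size_to_index: maps each flat position to the id of its segment
print(size_to_index(torch.tensor([2, 3, 1])))
# tensor([0, 0, 1, 1, 1, 2])

# multi_slice_mask: union of half-open [start, end) slices as a boolean mask
print(multi_slice_mask(torch.tensor([0, 4]), torch.tensor([2, 5]), length=6))
# tensor([ True,  True, False, False,  True, False])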
ultra/datasets.py
ADDED
@@ -0,0 +1,1095 @@
1 |
+
import os
|
2 |
+
import csv
|
3 |
+
import shutil
|
4 |
+
import torch
|
5 |
+
from torch_geometric.data import Data, InMemoryDataset, download_url, extract_zip
|
6 |
+
from torch_geometric.datasets import RelLinkPredDataset, WordNet18RR
|
7 |
+
|
8 |
+
from ultra.tasks import build_relation_graph
|
9 |
+
|
10 |
+
|
11 |
+
class GrailInductiveDataset(InMemoryDataset):
|
12 |
+
|
13 |
+
def __init__(self, root, version, transform=None, pre_transform=build_relation_graph, merge_valid_test=True):
|
14 |
+
self.version = version
|
15 |
+
assert version in ["v1", "v2", "v3", "v4"]
|
16 |
+
|
17 |
+
# by default, most models on Grail datasets merge inductive valid and test splits as the final test split
|
18 |
+
# with this choice, the validation set is that of the transductive train (on the seen graph)
|
19 |
+
# by default it's turned on but you can experiment with turning this option off
|
20 |
+
# you'll need to delete the processed datasets then and re-run to cache a new dataset
|
21 |
+
self.merge_valid_test = merge_valid_test
|
22 |
+
super().__init__(root, transform, pre_transform)
|
23 |
+
self.data, self.slices = torch.load(self.processed_paths[0])
|
24 |
+
|
25 |
+
@property
|
26 |
+
def num_relations(self):
|
27 |
+
return int(self.data.edge_type.max()) + 1
|
28 |
+
|
29 |
+
@property
|
30 |
+
def raw_dir(self):
|
31 |
+
return os.path.join(self.root, "grail", self.name, self.version, "raw")
|
32 |
+
|
33 |
+
@property
|
34 |
+
def processed_dir(self):
|
35 |
+
return os.path.join(self.root, "grail", self.name, self.version, "processed")
|
36 |
+
|
37 |
+
@property
|
38 |
+
def processed_file_names(self):
|
39 |
+
return "data.pt"
|
40 |
+
|
41 |
+
@property
|
42 |
+
def raw_file_names(self):
|
43 |
+
return [
|
44 |
+
"train_ind.txt", "valid_ind.txt", "test_ind.txt", "train.txt", "valid.txt"
|
45 |
+
]
|
46 |
+
|
47 |
+
def download(self):
|
48 |
+
for url, path in zip(self.urls, self.raw_paths):
|
49 |
+
download_path = download_url(url % self.version, self.raw_dir)
|
50 |
+
os.rename(download_path, path)
|
51 |
+
|
52 |
+
def process(self):
|
53 |
+
test_files = self.raw_paths[:3]
|
54 |
+
train_files = self.raw_paths[3:]
|
55 |
+
|
56 |
+
inv_train_entity_vocab = {}
|
57 |
+
inv_test_entity_vocab = {}
|
58 |
+
inv_relation_vocab = {}
|
59 |
+
triplets = []
|
60 |
+
num_samples = []
|
61 |
+
|
62 |
+
for txt_file in train_files:
|
63 |
+
with open(txt_file, "r") as fin:
|
64 |
+
num_sample = 0
|
65 |
+
for line in fin:
|
66 |
+
h_token, r_token, t_token = line.strip().split("\t")
|
67 |
+
if h_token not in inv_train_entity_vocab:
|
68 |
+
inv_train_entity_vocab[h_token] = len(inv_train_entity_vocab)
|
69 |
+
h = inv_train_entity_vocab[h_token]
|
70 |
+
if r_token not in inv_relation_vocab:
|
71 |
+
inv_relation_vocab[r_token] = len(inv_relation_vocab)
|
72 |
+
r = inv_relation_vocab[r_token]
|
73 |
+
if t_token not in inv_train_entity_vocab:
|
74 |
+
inv_train_entity_vocab[t_token] = len(inv_train_entity_vocab)
|
75 |
+
t = inv_train_entity_vocab[t_token]
|
76 |
+
triplets.append((h, t, r))
|
77 |
+
num_sample += 1
|
78 |
+
num_samples.append(num_sample)
|
79 |
+
|
80 |
+
for txt_file in test_files:
|
81 |
+
with open(txt_file, "r") as fin:
|
82 |
+
num_sample = 0
|
83 |
+
for line in fin:
|
84 |
+
h_token, r_token, t_token = line.strip().split("\t")
|
85 |
+
if h_token not in inv_test_entity_vocab:
|
86 |
+
inv_test_entity_vocab[h_token] = len(inv_test_entity_vocab)
|
87 |
+
h = inv_test_entity_vocab[h_token]
|
88 |
+
assert r_token in inv_relation_vocab
|
89 |
+
r = inv_relation_vocab[r_token]
|
90 |
+
if t_token not in inv_test_entity_vocab:
|
91 |
+
inv_test_entity_vocab[t_token] = len(inv_test_entity_vocab)
|
92 |
+
t = inv_test_entity_vocab[t_token]
|
93 |
+
triplets.append((h, t, r))
|
94 |
+
num_sample += 1
|
95 |
+
num_samples.append(num_sample)
|
96 |
+
triplets = torch.tensor(triplets)
|
97 |
+
|
98 |
+
edge_index = triplets[:, :2].t()
|
99 |
+
edge_type = triplets[:, 2]
|
100 |
+
num_relations = int(edge_type.max()) + 1
|
101 |
+
|
102 |
+
# creating fact graphs - those are graphs sent to a model, based on which we'll predict missing facts
|
103 |
+
# also, those fact graphs will be used for filtered evaluation
|
104 |
+
train_fact_slice = slice(None, sum(num_samples[:1]))
|
105 |
+
test_fact_slice = slice(sum(num_samples[:2]), sum(num_samples[:3]))
|
106 |
+
train_fact_index = edge_index[:, train_fact_slice]
|
107 |
+
train_fact_type = edge_type[train_fact_slice]
|
108 |
+
test_fact_index = edge_index[:, test_fact_slice]
|
109 |
+
test_fact_type = edge_type[test_fact_slice]
|
110 |
+
|
111 |
+
# add flipped triplets for the fact graphs
|
112 |
+
train_fact_index = torch.cat([train_fact_index, train_fact_index.flip(0)], dim=-1)
|
113 |
+
train_fact_type = torch.cat([train_fact_type, train_fact_type + num_relations])
|
114 |
+
test_fact_index = torch.cat([test_fact_index, test_fact_index.flip(0)], dim=-1)
|
115 |
+
test_fact_type = torch.cat([test_fact_type, test_fact_type + num_relations])
|
116 |
+
|
117 |
+
train_slice = slice(None, sum(num_samples[:1]))
|
118 |
+
valid_slice = slice(sum(num_samples[:1]), sum(num_samples[:2]))
|
119 |
+
# by default, SOTA models on Grail datasets merge inductive valid and test splits as the final test split
|
120 |
+
# with this choice, the validation set is that of the transductive train (on the seen graph)
|
121 |
+
# by default it's turned on but you can experiment with turning this option off
|
122 |
+
test_slice = slice(sum(num_samples[:3]), sum(num_samples)) if self.merge_valid_test else slice(sum(num_samples[:4]), sum(num_samples))
|
123 |
+
|
124 |
+
train_data = Data(edge_index=train_fact_index, edge_type=train_fact_type, num_nodes=len(inv_train_entity_vocab),
|
125 |
+
target_edge_index=edge_index[:, train_slice], target_edge_type=edge_type[train_slice], num_relations=num_relations*2)
|
126 |
+
valid_data = Data(edge_index=train_fact_index, edge_type=train_fact_type, num_nodes=len(inv_train_entity_vocab),
|
127 |
+
target_edge_index=edge_index[:, valid_slice], target_edge_type=edge_type[valid_slice], num_relations=num_relations*2)
|
128 |
+
test_data = Data(edge_index=test_fact_index, edge_type=test_fact_type, num_nodes=len(inv_test_entity_vocab),
|
129 |
+
target_edge_index=edge_index[:, test_slice], target_edge_type=edge_type[test_slice], num_relations=num_relations*2)
|
130 |
+
|
131 |
+
if self.pre_transform is not None:
|
132 |
+
train_data = self.pre_transform(train_data)
|
133 |
+
valid_data = self.pre_transform(valid_data)
|
134 |
+
test_data = self.pre_transform(test_data)
|
135 |
+
|
136 |
+
torch.save((self.collate([train_data, valid_data, test_data])), self.processed_paths[0])
|
137 |
+
|
138 |
+
def __repr__(self):
|
139 |
+
return "%s(%s)" % (self.name, self.version)
|
140 |
+
|
141 |
+
|
142 |
+
class FB15k237Inductive(GrailInductiveDataset):
|
143 |
+
|
144 |
+
urls = [
|
145 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/fb237_%s_ind/train.txt",
|
146 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/fb237_%s_ind/valid.txt",
|
147 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/fb237_%s_ind/test.txt",
|
148 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/fb237_%s/train.txt",
|
149 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/fb237_%s/valid.txt"
|
150 |
+
]
|
151 |
+
|
152 |
+
name = "IndFB15k237"
|
153 |
+
|
154 |
+
def __init__(self, root, version):
|
155 |
+
super().__init__(root, version)
|
156 |
+
|
157 |
+
class WN18RRInductive(GrailInductiveDataset):
|
158 |
+
|
159 |
+
urls = [
|
160 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/WN18RR_%s_ind/train.txt",
|
161 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/WN18RR_%s_ind/valid.txt",
|
162 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/WN18RR_%s_ind/test.txt",
|
163 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/WN18RR_%s/train.txt",
|
164 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/WN18RR_%s/valid.txt"
|
165 |
+
]
|
166 |
+
|
167 |
+
name = "IndWN18RR"
|
168 |
+
|
169 |
+
def __init__(self, root, version):
|
170 |
+
super().__init__(root, version)
|
171 |
+
|
172 |
+
class NELLInductive(GrailInductiveDataset):
|
173 |
+
urls = [
|
174 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/nell_%s_ind/train.txt",
|
175 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/nell_%s_ind/valid.txt",
|
176 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/nell_%s_ind/test.txt",
|
177 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/nell_%s/train.txt",
|
178 |
+
"https://raw.githubusercontent.com/kkteru/grail/master/data/nell_%s/valid.txt"
|
179 |
+
]
|
180 |
+
name = "IndNELL"
|
181 |
+
|
182 |
+
def __init__(self, root, version):
|
183 |
+
super().__init__(root, version)
|
184 |
+
|
185 |
+
|
186 |
+
def FB15k237(root):
|
187 |
+
dataset = RelLinkPredDataset(name="FB15k-237", root=root+"/fb15k237/")
|
188 |
+
data = dataset.data
|
189 |
+
train_data = Data(edge_index=data.edge_index, edge_type=data.edge_type, num_nodes=data.num_nodes,
|
190 |
+
target_edge_index=data.train_edge_index, target_edge_type=data.train_edge_type,
|
191 |
+
num_relations=dataset.num_relations)
|
192 |
+
valid_data = Data(edge_index=data.edge_index, edge_type=data.edge_type, num_nodes=data.num_nodes,
|
193 |
+
target_edge_index=data.valid_edge_index, target_edge_type=data.valid_edge_type,
|
194 |
+
num_relations=dataset.num_relations)
|
195 |
+
test_data = Data(edge_index=data.edge_index, edge_type=data.edge_type, num_nodes=data.num_nodes,
|
196 |
+
target_edge_index=data.test_edge_index, target_edge_type=data.test_edge_type,
|
197 |
+
num_relations=dataset.num_relations)
|
198 |
+
|
199 |
+
# build relation graphs
|
200 |
+
train_data = build_relation_graph(train_data)
|
201 |
+
valid_data = build_relation_graph(valid_data)
|
202 |
+
test_data = build_relation_graph(test_data)
|
203 |
+
|
204 |
+
dataset.data, dataset.slices = dataset.collate([train_data, valid_data, test_data])
|
205 |
+
return dataset
|
206 |
+
|
207 |
+
def WN18RR(root):
|
208 |
+
dataset = WordNet18RR(root=root+"/wn18rr/")
|
209 |
+
# convert wn18rr into the same format as fb15k-237
|
210 |
+
data = dataset.data
|
211 |
+
num_nodes = int(data.edge_index.max()) + 1
|
212 |
+
num_relations = int(data.edge_type.max()) + 1
|
213 |
+
edge_index = data.edge_index[:, data.train_mask]
|
214 |
+
edge_type = data.edge_type[data.train_mask]
|
215 |
+
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=-1)
|
216 |
+
edge_type = torch.cat([edge_type, edge_type + num_relations])
|
217 |
+
train_data = Data(edge_index=edge_index, edge_type=edge_type, num_nodes=num_nodes,
|
218 |
+
target_edge_index=data.edge_index[:, data.train_mask],
|
219 |
+
target_edge_type=data.edge_type[data.train_mask],
|
220 |
+
num_relations=num_relations*2)
|
221 |
+
valid_data = Data(edge_index=edge_index, edge_type=edge_type, num_nodes=num_nodes,
|
222 |
+
target_edge_index=data.edge_index[:, data.val_mask],
|
223 |
+
target_edge_type=data.edge_type[data.val_mask],
|
224 |
+
num_relations=num_relations*2)
|
225 |
+
test_data = Data(edge_index=edge_index, edge_type=edge_type, num_nodes=num_nodes,
|
226 |
+
target_edge_index=data.edge_index[:, data.test_mask],
|
227 |
+
target_edge_type=data.edge_type[data.test_mask],
|
228 |
+
num_relations=num_relations*2)
|
229 |
+
|
230 |
+
# build relation graphs
|
231 |
+
train_data = build_relation_graph(train_data)
|
232 |
+
valid_data = build_relation_graph(valid_data)
|
233 |
+
test_data = build_relation_graph(test_data)
|
234 |
+
|
235 |
+
dataset.data, dataset.slices = dataset.collate([train_data, valid_data, test_data])
|
236 |
+
dataset.num_relations = num_relations * 2
|
237 |
+
return dataset
|
238 |
+
|
239 |
+
|
240 |
+
class TransductiveDataset(InMemoryDataset):
|
241 |
+
|
242 |
+
delimiter = None
|
243 |
+
|
244 |
+
def __init__(self, root, transform=None, pre_transform=build_relation_graph, **kwargs):
|
245 |
+
|
246 |
+
super().__init__(root, transform, pre_transform)
|
247 |
+
self.data, self.slices = torch.load(self.processed_paths[0])
|
248 |
+
|
249 |
+
@property
|
250 |
+
def raw_file_names(self):
|
251 |
+
return ["train.txt", "valid.txt", "test.txt"]
|
252 |
+
|
253 |
+
def download(self):
|
254 |
+
for url, path in zip(self.urls, self.raw_paths):
|
255 |
+
download_path = download_url(url, self.raw_dir)
|
256 |
+
os.rename(download_path, path)
|
257 |
+
|
258 |
+
def load_file(self, triplet_file, inv_entity_vocab={}, inv_rel_vocab={}):
|
259 |
+
|
260 |
+
triplets = []
|
261 |
+
entity_cnt, rel_cnt = len(inv_entity_vocab), len(inv_rel_vocab)
|
262 |
+
|
263 |
+
with open(triplet_file, "r", encoding="utf-8") as fin:
|
264 |
+
for l in fin:
|
265 |
+
u, r, v = l.split() if self.delimiter is None else l.strip().split(self.delimiter)
|
266 |
+
if u not in inv_entity_vocab:
|
267 |
+
inv_entity_vocab[u] = entity_cnt
|
268 |
+
entity_cnt += 1
|
269 |
+
if v not in inv_entity_vocab:
|
270 |
+
inv_entity_vocab[v] = entity_cnt
|
271 |
+
entity_cnt += 1
|
272 |
+
if r not in inv_rel_vocab:
|
273 |
+
inv_rel_vocab[r] = rel_cnt
|
274 |
+
rel_cnt += 1
|
275 |
+
u, r, v = inv_entity_vocab[u], inv_rel_vocab[r], inv_entity_vocab[v]
|
276 |
+
|
277 |
+
triplets.append((u, v, r))
|
278 |
+
|
279 |
+
return {
|
280 |
+
"triplets": triplets,
|
281 |
+
"num_node": len(inv_entity_vocab), #entity_cnt,
|
282 |
+
"num_relation": rel_cnt,
|
283 |
+
"inv_entity_vocab": inv_entity_vocab,
|
284 |
+
"inv_rel_vocab": inv_rel_vocab
|
285 |
+
}
|
286 |
+
|
287 |
+
# default loading procedure: process train/valid/test files, create graphs from them
|
288 |
+
def process(self):
|
289 |
+
|
290 |
+
train_files = self.raw_paths[:3]
|
291 |
+
|
292 |
+
train_results = self.load_file(train_files[0], inv_entity_vocab={}, inv_rel_vocab={})
|
293 |
+
valid_results = self.load_file(train_files[1],
|
294 |
+
train_results["inv_entity_vocab"], train_results["inv_rel_vocab"])
|
295 |
+
test_results = self.load_file(train_files[2],
|
296 |
+
train_results["inv_entity_vocab"], train_results["inv_rel_vocab"])
|
297 |
+
|
298 |
+
# in some datasets, there are several new nodes in the test set, eg 123,143 YAGO train adn 123,182 in YAGO test
|
299 |
+
# for consistency with other experimental results, we'll include those in the full vocab and num nodes
|
300 |
+
num_node = test_results["num_node"]
|
301 |
+
# the same for rels: in most cases train == test for transductive
|
302 |
+
# for AristoV4 train rels 1593, test 1604
|
303 |
+
num_relations = test_results["num_relation"]
|
304 |
+
|
305 |
+
train_triplets = train_results["triplets"]
|
306 |
+
valid_triplets = valid_results["triplets"]
|
307 |
+
test_triplets = test_results["triplets"]
|
308 |
+
|
309 |
+
train_target_edges = torch.tensor([[t[0], t[1]] for t in train_triplets], dtype=torch.long).t()
|
310 |
+
train_target_etypes = torch.tensor([t[2] for t in train_triplets])
|
311 |
+
|
312 |
+
valid_edges = torch.tensor([[t[0], t[1]] for t in valid_triplets], dtype=torch.long).t()
|
313 |
+
valid_etypes = torch.tensor([t[2] for t in valid_triplets])
|
314 |
+
|
315 |
+
test_edges = torch.tensor([[t[0], t[1]] for t in test_triplets], dtype=torch.long).t()
|
316 |
+
test_etypes = torch.tensor([t[2] for t in test_triplets])
|
317 |
+
|
318 |
+
train_edges = torch.cat([train_target_edges, train_target_edges.flip(0)], dim=1)
|
319 |
+
train_etypes = torch.cat([train_target_etypes, train_target_etypes+num_relations])
|
320 |
+
|
321 |
+
train_data = Data(edge_index=train_edges, edge_type=train_etypes, num_nodes=num_node,
|
322 |
+
target_edge_index=train_target_edges, target_edge_type=train_target_etypes, num_relations=num_relations*2)
|
323 |
+
valid_data = Data(edge_index=train_edges, edge_type=train_etypes, num_nodes=num_node,
|
324 |
+
target_edge_index=valid_edges, target_edge_type=valid_etypes, num_relations=num_relations*2)
|
325 |
+
test_data = Data(edge_index=train_edges, edge_type=train_etypes, num_nodes=num_node,
|
326 |
+
target_edge_index=test_edges, target_edge_type=test_etypes, num_relations=num_relations*2)
|
327 |
+
|
328 |
+
# build graphs of relations
|
329 |
+
if self.pre_transform is not None:
|
330 |
+
train_data = self.pre_transform(train_data)
|
331 |
+
valid_data = self.pre_transform(valid_data)
|
332 |
+
test_data = self.pre_transform(test_data)
|
333 |
+
|
334 |
+
torch.save((self.collate([train_data, valid_data, test_data])), self.processed_paths[0])
|
335 |
+
|
336 |
+
def __repr__(self):
|
337 |
+
return "%s()" % (self.name)
|
338 |
+
|
339 |
+
@property
|
340 |
+
def num_relations(self):
|
341 |
+
return int(self.data.edge_type.max()) + 1
|
342 |
+
|
343 |
+
@property
|
344 |
+
def raw_dir(self):
|
345 |
+
return os.path.join(self.root, self.name, "raw")
|
346 |
+
|
347 |
+
@property
|
348 |
+
def processed_dir(self):
|
349 |
+
return os.path.join(self.root, self.name, "processed")
|
350 |
+
|
351 |
+
@property
|
352 |
+
def processed_file_names(self):
|
353 |
+
return "data.pt"
|
354 |
+
|
355 |
+
|
356 |
+
|
357 |
+
class CoDEx(TransductiveDataset):
|
358 |
+
|
359 |
+
name = "codex"
|
360 |
+
urls = [
|
361 |
+
"https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/%s/train.txt",
|
362 |
+
"https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/%s/valid.txt",
|
363 |
+
"https://raw.githubusercontent.com/tsafavi/codex/master/data/triples/%s/test.txt",
|
364 |
+
]
|
365 |
+
|
366 |
+
def download(self):
|
367 |
+
for url, path in zip(self.urls, self.raw_paths):
|
368 |
+
download_path = download_url(url % self.name, self.raw_dir)
|
369 |
+
os.rename(download_path, path)
|
370 |
+
|
371 |
+
|
372 |
+
class CoDExSmall(CoDEx):
|
373 |
+
"""
|
374 |
+
#node: 2034
|
375 |
+
#edge: 36543
|
376 |
+
#relation: 42
|
377 |
+
"""
|
378 |
+
url = "https://zenodo.org/record/4281094/files/codex-s.tar.gz"
|
379 |
+
md5 = "63cd8186fc2aeddc154e20cf4a10087e"
|
380 |
+
name = "codex-s"
|
381 |
+
|
382 |
+
def __init__(self, root):
|
383 |
+
super(CoDExSmall, self).__init__(root=root, size='s')
|
384 |
+
|
385 |
+
|
386 |
+
class CoDExMedium(CoDEx):
|
387 |
+
"""
|
388 |
+
#node: 17050
|
389 |
+
#edge: 206205
|
390 |
+
#relation: 51
|
391 |
+
"""
|
392 |
+
url = "https://zenodo.org/record/4281094/files/codex-m.tar.gz"
|
393 |
+
md5 = "43e561cfdca1c6ad9cc2f5b1ca4add76"
|
394 |
+
name = "codex-m"
|
395 |
+
def __init__(self, root):
|
396 |
+
super(CoDExMedium, self).__init__(root=root, size='m')
|
397 |
+
|
398 |
+
|
399 |
+
class CoDExLarge(CoDEx):
|
400 |
+
"""
|
401 |
+
#node: 77951
|
402 |
+
#edge: 612437
|
403 |
+
#relation: 69
|
404 |
+
"""
|
405 |
+
url = "https://zenodo.org/record/4281094/files/codex-l.tar.gz"
|
406 |
+
md5 = "9a10f4458c4bd2b16ef9b92b677e0d71"
|
407 |
+
name = "codex-l"
|
408 |
+
def __init__(self, root):
|
409 |
+
super(CoDExLarge, self).__init__(root=root, size='l')
|
410 |
+
|
411 |
+
|
412 |
+
class NELL995(TransductiveDataset):
|
413 |
+
|
414 |
+
# from the RED-GNN paper https://github.com/LARS-research/RED-GNN/tree/main/transductive/data/nell
|
415 |
+
# the OG dumps were found to have test set leakages
|
416 |
+
# training set is made out of facts+train files, so we sum up their samples to build one training graph
|
417 |
+
|
418 |
+
urls = [
|
419 |
+
"https://raw.githubusercontent.com/LARS-research/RED-GNN/main/transductive/data/nell/facts.txt",
|
420 |
+
"https://raw.githubusercontent.com/LARS-research/RED-GNN/main/transductive/data/nell/train.txt",
|
421 |
+
"https://raw.githubusercontent.com/LARS-research/RED-GNN/main/transductive/data/nell/valid.txt",
|
422 |
+
"https://raw.githubusercontent.com/LARS-research/RED-GNN/main/transductive/data/nell/test.txt",
|
423 |
+
]
|
424 |
+
name = "nell995"
|
425 |
+
|
426 |
+
@property
|
427 |
+
def raw_file_names(self):
|
428 |
+
return ["facts.txt", "train.txt", "valid.txt", "test.txt"]
|
429 |
+
|
430 |
+
|
431 |
+
def process(self):
|
432 |
+
train_files = self.raw_paths[:4]
|
433 |
+
|
434 |
+
facts_results = self.load_file(train_files[0], inv_entity_vocab={}, inv_rel_vocab={})
|
435 |
+
train_results = self.load_file(train_files[1], facts_results["inv_entity_vocab"], facts_results["inv_rel_vocab"])
|
436 |
+
valid_results = self.load_file(train_files[2], train_results["inv_entity_vocab"], train_results["inv_rel_vocab"])
|
437 |
+
test_results = self.load_file(train_files[3], train_results["inv_entity_vocab"], train_results["inv_rel_vocab"])
|
438 |
+
|
439 |
+
num_node = valid_results["num_node"]
|
440 |
+
num_relations = train_results["num_relation"]
|
441 |
+
|
442 |
+
train_triplets = facts_results["triplets"] + train_results["triplets"]
|
443 |
+
valid_triplets = valid_results["triplets"]
|
444 |
+
test_triplets = test_results["triplets"]
|
445 |
+
|
446 |
+
train_target_edges = torch.tensor([[t[0], t[1]] for t in train_triplets], dtype=torch.long).t()
|
447 |
+
train_target_etypes = torch.tensor([t[2] for t in train_triplets])
|
448 |
+
|
449 |
+
valid_edges = torch.tensor([[t[0], t[1]] for t in valid_triplets], dtype=torch.long).t()
|
450 |
+
valid_etypes = torch.tensor([t[2] for t in valid_triplets])
|
451 |
+
|
452 |
+
test_edges = torch.tensor([[t[0], t[1]] for t in test_triplets], dtype=torch.long).t()
|
453 |
+
test_etypes = torch.tensor([t[2] for t in test_triplets])
|
454 |
+
|
455 |
+
train_edges = torch.cat([train_target_edges, train_target_edges.flip(0)], dim=1)
|
456 |
+
train_etypes = torch.cat([train_target_etypes, train_target_etypes+num_relations])
|
457 |
+
|
458 |
+
train_data = Data(edge_index=train_edges, edge_type=train_etypes, num_nodes=num_node,
|
459 |
+
target_edge_index=train_target_edges, target_edge_type=train_target_etypes, num_relations=num_relations*2)
|
460 |
+
valid_data = Data(edge_index=train_edges, edge_type=train_etypes, num_nodes=num_node,
|
461 |
+
target_edge_index=valid_edges, target_edge_type=valid_etypes, num_relations=num_relations*2)
|
462 |
+
test_data = Data(edge_index=train_edges, edge_type=train_etypes, num_nodes=num_node,
|
463 |
+
target_edge_index=test_edges, target_edge_type=test_etypes, num_relations=num_relations*2)
|
464 |
+
|
465 |
+
# build graphs of relations
|
466 |
+
if self.pre_transform is not None:
|
467 |
+
train_data = self.pre_transform(train_data)
|
468 |
+
valid_data = self.pre_transform(valid_data)
|
469 |
+
test_data = self.pre_transform(test_data)
|
470 |
+
|
471 |
+
torch.save((self.collate([train_data, valid_data, test_data])), self.processed_paths[0])
|
472 |
+
|
473 |
+
|
474 |
+
class ConceptNet100k(TransductiveDataset):
|
475 |
+
|
476 |
+
urls = [
|
477 |
+
"https://raw.githubusercontent.com/guojiapub/BiQUE/master/src_data/conceptnet-100k/train",
|
478 |
+
"https://raw.githubusercontent.com/guojiapub/BiQUE/master/src_data/conceptnet-100k/valid",
|
479 |
+
"https://raw.githubusercontent.com/guojiapub/BiQUE/master/src_data/conceptnet-100k/test",
|
480 |
+
]
|
481 |
+
name = "cnet100k"
|
482 |
+
delimiter = "\t"
|
483 |
+
|
484 |
+
|
485 |
+
class DBpedia100k(TransductiveDataset):
|
486 |
+
urls = [
|
487 |
+
"https://raw.githubusercontent.com/iieir-km/ComplEx-NNE_AER/master/datasets/DB100K/_train.txt",
|
488 |
+
"https://raw.githubusercontent.com/iieir-km/ComplEx-NNE_AER/master/datasets/DB100K/_valid.txt",
|
489 |
+
"https://raw.githubusercontent.com/iieir-km/ComplEx-NNE_AER/master/datasets/DB100K/_test.txt",
|
490 |
+
]
|
491 |
+
name = "dbp100k"
|
492 |
+
|
493 |
+
|
494 |
+
class YAGO310(TransductiveDataset):
|
495 |
+
|
496 |
+
urls = [
|
497 |
+
"https://raw.githubusercontent.com/DeepGraphLearning/KnowledgeGraphEmbedding/master/data/YAGO3-10/train.txt",
|
498 |
+
"https://raw.githubusercontent.com/DeepGraphLearning/KnowledgeGraphEmbedding/master/data/YAGO3-10/valid.txt",
|
499 |
+
"https://raw.githubusercontent.com/DeepGraphLearning/KnowledgeGraphEmbedding/master/data/YAGO3-10/test.txt",
|
500 |
+
]
|
501 |
+
name = "yago310"
|
502 |
+
|
503 |
+
|
504 |
+
class Hetionet(TransductiveDataset):
|
505 |
+
|
506 |
+
urls = [
|
507 |
+
"https://www.dropbox.com/s/y47bt9oq57h6l5k/train.txt?dl=1",
|
508 |
+
"https://www.dropbox.com/s/a0pbrx9tz3dgsff/valid.txt?dl=1",
|
509 |
+
"https://www.dropbox.com/s/4dhrvg3fyq5tnu4/test.txt?dl=1",
|
510 |
+
]
|
511 |
+
name = "hetionet"
|
512 |
+
|
513 |
+
|
514 |
+
class AristoV4(TransductiveDataset):
|
515 |
+
|
516 |
+
url = "https://zenodo.org/record/5942560/files/aristo-v4.zip"
|
517 |
+
|
518 |
+
name = "aristov4"
|
519 |
+
delimiter = "\t"
|
520 |
+
|
521 |
+
def download(self):
|
522 |
+
download_path = download_url(self.url, self.raw_dir)
|
523 |
+
extract_zip(download_path, self.raw_dir)
|
524 |
+
os.unlink(download_path)
|
525 |
+
for oldname, newname in zip(['train', 'valid', 'test'], self.raw_paths):
|
526 |
+
os.rename(os.path.join(self.raw_dir, oldname), newname)
|
527 |
+
|
528 |
+
|
529 |
+
class SparserKG(TransductiveDataset):
|
530 |
+
|
531 |
+
# 5 datasets based on FB/NELL/WD, introduced in https://github.com/THU-KEG/DacKGR
|
532 |
+
# re-writing the loading function because dumps are in the format (h, t, r) while the standard is (h, r, t)
|
533 |
+
|
534 |
+
url = "https://raw.githubusercontent.com/THU-KEG/DacKGR/master/data.zip"
|
535 |
+
delimiter = "\t"
|
536 |
+
base_name = "SparseKG"
|
537 |
+
|
538 |
+
@property
|
539 |
+
def raw_dir(self):
|
540 |
+
return os.path.join(self.root, self.base_name, self.name, "raw")
|
541 |
+
|
542 |
+
@property
|
543 |
+
def processed_dir(self):
|
544 |
+
return os.path.join(self.root, self.base_name, self.name, "processed")
|
545 |
+
|
546 |
+
def download(self):
|
547 |
+
base_path = os.path.join(self.root, self.base_name)
|
548 |
+
download_path = download_url(self.url, base_path)
|
549 |
+
extract_zip(download_path, base_path)
|
550 |
+
for dsname in ['NELL23K', 'WD-singer', 'FB15K-237-10', 'FB15K-237-20', 'FB15K-237-50']:
|
551 |
+
for oldname, newname in zip(['train.triples', 'dev.triples', 'test.triples'], self.raw_file_names):
|
552 |
+
os.renames(os.path.join(base_path, "data", dsname, oldname), os.path.join(base_path, dsname, "raw", newname))
|
553 |
+
shutil.rmtree(os.path.join(base_path, "data"))
|
554 |
+
|
555 |
+
def load_file(self, triplet_file, inv_entity_vocab={}, inv_rel_vocab={}):
|
556 |
+
|
557 |
+
triplets = []
|
558 |
+
entity_cnt, rel_cnt = len(inv_entity_vocab), len(inv_rel_vocab)
|
559 |
+
|
560 |
+
with open(triplet_file, "r", encoding="utf-8") as fin:
|
561 |
+
for l in fin:
|
562 |
+
u, v, r = l.split() if self.delimiter is None else l.strip().split(self.delimiter)
|
563 |
+
if u not in inv_entity_vocab:
|
564 |
+
inv_entity_vocab[u] = entity_cnt
|
565 |
+
entity_cnt += 1
|
566 |
+
if v not in inv_entity_vocab:
|
567 |
+
inv_entity_vocab[v] = entity_cnt
|
568 |
+
entity_cnt += 1
|
569 |
+
if r not in inv_rel_vocab:
|
570 |
+
inv_rel_vocab[r] = rel_cnt
|
571 |
+
rel_cnt += 1
|
572 |
+
u, r, v = inv_entity_vocab[u], inv_rel_vocab[r], inv_entity_vocab[v]
|
573 |
+
|
574 |
+
triplets.append((u, v, r))
|
575 |
+
|
576 |
+
return {
|
577 |
+
"triplets": triplets,
|
578 |
+
"num_node": len(inv_entity_vocab), #entity_cnt,
|
579 |
+
"num_relation": rel_cnt,
|
580 |
+
"inv_entity_vocab": inv_entity_vocab,
|
581 |
+
"inv_rel_vocab": inv_rel_vocab
|
582 |
+
}
|
583 |
+
|
584 |
+
class WDsinger(SparserKG):
|
585 |
+
name = "WD-singer"
|
586 |
+
|
587 |
+
class NELL23k(SparserKG):
|
588 |
+
name = "NELL23K"
|
589 |
+
|
590 |
+
class FB15k237_10(SparserKG):
|
591 |
+
name = "FB15K-237-10"
|
592 |
+
|
593 |
+
class FB15k237_20(SparserKG):
|
594 |
+
name = "FB15K-237-20"
|
595 |
+
|
596 |
+
class FB15k237_50(SparserKG):
|
597 |
+
name = "FB15K-237-50"
|
598 |
+
|
599 |
+
|
600 |
+
class InductiveDataset(InMemoryDataset):
|
601 |
    delimiter = None
    # some datasets (4 from Hamaguchi et al. and Indigo) have a validation set based on the train graph, not the inference graph
    valid_on_inf = True  # by default, validation triples are ranked against the inference graph

    def __init__(self, root, version, transform=None, pre_transform=build_relation_graph, **kwargs):

        self.version = str(version)
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    def download(self):
        for url, path in zip(self.urls, self.raw_paths):
            download_path = download_url(url % self.version, self.raw_dir)
            os.rename(download_path, path)

    def load_file(self, triplet_file, inv_entity_vocab={}, inv_rel_vocab={}):

        triplets = []
        entity_cnt, rel_cnt = len(inv_entity_vocab), len(inv_rel_vocab)

        with open(triplet_file, "r", encoding="utf-8") as fin:
            for l in fin:
                u, r, v = l.split() if self.delimiter is None else l.strip().split(self.delimiter)
                if u not in inv_entity_vocab:
                    inv_entity_vocab[u] = entity_cnt
                    entity_cnt += 1
                if v not in inv_entity_vocab:
                    inv_entity_vocab[v] = entity_cnt
                    entity_cnt += 1
                if r not in inv_rel_vocab:
                    inv_rel_vocab[r] = rel_cnt
                    rel_cnt += 1
                u, r, v = inv_entity_vocab[u], inv_rel_vocab[r], inv_entity_vocab[v]

                triplets.append((u, v, r))

        return {
            "triplets": triplets,
            "num_node": len(inv_entity_vocab),  # entity_cnt
            "num_relation": rel_cnt,
            "inv_entity_vocab": inv_entity_vocab,
            "inv_rel_vocab": inv_rel_vocab
        }

    def process(self):

        train_files = self.raw_paths[:4]

        train_res = self.load_file(train_files[0], inv_entity_vocab={}, inv_rel_vocab={})
        inference_res = self.load_file(train_files[1], inv_entity_vocab={}, inv_rel_vocab={})
        valid_res = self.load_file(
            train_files[2],
            inference_res["inv_entity_vocab"] if self.valid_on_inf else train_res["inv_entity_vocab"],
            inference_res["inv_rel_vocab"] if self.valid_on_inf else train_res["inv_rel_vocab"]
        )
        test_res = self.load_file(train_files[3], inference_res["inv_entity_vocab"], inference_res["inv_rel_vocab"])

        num_train_nodes, num_train_rels = train_res["num_node"], train_res["num_relation"]
        inference_num_nodes, inference_num_rels = test_res["num_node"], test_res["num_relation"]

        train_edges, inf_graph, inf_valid_edges, inf_test_edges = train_res["triplets"], inference_res["triplets"], valid_res["triplets"], test_res["triplets"]

        train_target_edges = torch.tensor([[t[0], t[1]] for t in train_edges], dtype=torch.long).t()
        train_target_etypes = torch.tensor([t[2] for t in train_edges])

        train_fact_index = torch.cat([train_target_edges, train_target_edges.flip(0)], dim=1)
        train_fact_type = torch.cat([train_target_etypes, train_target_etypes + num_train_rels])

        inf_edges = torch.tensor([[t[0], t[1]] for t in inf_graph], dtype=torch.long).t()
        inf_edges = torch.cat([inf_edges, inf_edges.flip(0)], dim=1)
        inf_etypes = torch.tensor([t[2] for t in inf_graph])
        inf_etypes = torch.cat([inf_etypes, inf_etypes + inference_num_rels])

        inf_valid_edges = torch.tensor(inf_valid_edges, dtype=torch.long)
        inf_test_edges = torch.tensor(inf_test_edges, dtype=torch.long)

        train_data = Data(edge_index=train_fact_index, edge_type=train_fact_type, num_nodes=num_train_nodes,
                          target_edge_index=train_target_edges, target_edge_type=train_target_etypes, num_relations=num_train_rels*2)
        valid_data = Data(edge_index=inf_edges if self.valid_on_inf else train_fact_index,
                          edge_type=inf_etypes if self.valid_on_inf else train_fact_type,
                          num_nodes=inference_num_nodes if self.valid_on_inf else num_train_nodes,
                          target_edge_index=inf_valid_edges[:, :2].T,
                          target_edge_type=inf_valid_edges[:, 2],
                          num_relations=inference_num_rels*2 if self.valid_on_inf else num_train_rels*2)
        test_data = Data(edge_index=inf_edges, edge_type=inf_etypes, num_nodes=inference_num_nodes,
                         target_edge_index=inf_test_edges[:, :2].T, target_edge_type=inf_test_edges[:, 2], num_relations=inference_num_rels*2)

        if self.pre_transform is not None:
            train_data = self.pre_transform(train_data)
            valid_data = self.pre_transform(valid_data)
            test_data = self.pre_transform(test_data)

        torch.save((self.collate([train_data, valid_data, test_data])), self.processed_paths[0])

    @property
    def num_relations(self):
        return int(self.data.edge_type.max()) + 1

    @property
    def raw_dir(self):
        return os.path.join(self.root, self.name, self.version, "raw")

    @property
    def processed_dir(self):
        return os.path.join(self.root, self.name, self.version, "processed")

    @property
    def raw_file_names(self):
        return [
            "transductive_train.txt", "inference_graph.txt", "inf_valid.txt", "inf_test.txt"
        ]

    @property
    def processed_file_names(self):
        return "data.pt"

    def __repr__(self):
        return "%s(%s)" % (self.name, self.version)
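A quick tensor-level sketch of the inverse-relation augmentation performed in process() above, with toy numbers (the ids are hypothetical): every (h, t, r) fact gains a reversed copy (t, h, r + num_rels), which is why num_relations is doubled.

import torch

num_rels = 3
edges = torch.tensor([[0, 1], [1, 2]]).t()              # (2, num_facts) heads over tails
etypes = torch.tensor([0, 2])                           # relation ids per fact

fact_index = torch.cat([edges, edges.flip(0)], dim=1)   # reversed copies appended
fact_type = torch.cat([etypes, etypes + num_rels])      # inverse rels offset by num_rels

assert fact_index.shape[1] == 2 * edges.shape[1]
assert int(fact_type.max()) == num_rels + 2             # 2 was the largest original id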
class IngramInductive(InductiveDataset):

    @property
    def raw_dir(self):
        return os.path.join(self.root, "ingram", self.name, self.version, "raw")

    @property
    def processed_dir(self):
        return os.path.join(self.root, "ingram", self.name, self.version, "processed")


class FBIngram(IngramInductive):

    urls = [
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/FB-%s/train.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/FB-%s/msg.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/FB-%s/valid.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/FB-%s/test.txt",
    ]
    name = "fb"


class WKIngram(IngramInductive):

    urls = [
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/WK-%s/train.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/WK-%s/msg.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/WK-%s/valid.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/WK-%s/test.txt",
    ]
    name = "wk"


class NLIngram(IngramInductive):

    urls = [
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/NL-%s/train.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/NL-%s/msg.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/NL-%s/valid.txt",
        "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/NL-%s/test.txt",
    ]
    name = "nl"


class ILPC2022(InductiveDataset):

    urls = [
        "https://raw.githubusercontent.com/pykeen/ilpc2022/master/data/%s/train.txt",
        "https://raw.githubusercontent.com/pykeen/ilpc2022/master/data/%s/inference.txt",
        "https://raw.githubusercontent.com/pykeen/ilpc2022/master/data/%s/inference_validation.txt",
        "https://raw.githubusercontent.com/pykeen/ilpc2022/master/data/%s/inference_test.txt",
    ]

    name = "ilpc2022"
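For clarity, the "%s" placeholder in the URL lists above is filled by download() with the dataset version string. A hypothetical InGram split id "100" would expand like this (whichever split ids the upstream repo actually hosts are the valid ones):

url_template = "https://raw.githubusercontent.com/bdi-lab/InGram/master/data/FB-%s/train.txt"
print(url_template % "100")
# -> https://raw.githubusercontent.com/bdi-lab/InGram/master/data/FB-100/train.txt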
class HM(InductiveDataset):
    # benchmarks from Hamaguchi et al. and the Indigo BM

    urls = [
        "https://raw.githubusercontent.com/shuwen-liu-ox/INDIGO/master/data/%s/train/train.txt",
        "https://raw.githubusercontent.com/shuwen-liu-ox/INDIGO/master/data/%s/test/test-graph.txt",
        "https://raw.githubusercontent.com/shuwen-liu-ox/INDIGO/master/data/%s/train/valid.txt",
        "https://raw.githubusercontent.com/shuwen-liu-ox/INDIGO/master/data/%s/test/test-fact.txt",
    ]

    name = "hm"
    versions = {
        '1k': "Hamaguchi-BM_both-1000",
        '3k': "Hamaguchi-BM_both-3000",
        '5k': "Hamaguchi-BM_both-5000",
        'indigo': "INDIGO-BM"
    }
    # in the 4 HM graphs, the validation set is based on the train graph, so we adjust the dataset creation accordingly
    valid_on_inf = False

    def __init__(self, root, version, **kwargs):
        version = self.versions[version]
        super().__init__(root, version, **kwargs)

    # HM datasets are a bit weird: the validation set (based on the train graph) has a few hundred new nodes, so we need custom processing
    def process(self):

        train_files = self.raw_paths[:4]

        train_res = self.load_file(train_files[0], inv_entity_vocab={}, inv_rel_vocab={})
        inference_res = self.load_file(train_files[1], inv_entity_vocab={}, inv_rel_vocab={})
        valid_res = self.load_file(
            train_files[2],
            inference_res["inv_entity_vocab"] if self.valid_on_inf else train_res["inv_entity_vocab"],
            inference_res["inv_rel_vocab"] if self.valid_on_inf else train_res["inv_rel_vocab"]
        )
        test_res = self.load_file(train_files[3], inference_res["inv_entity_vocab"], inference_res["inv_rel_vocab"])

        num_train_nodes, num_train_rels = train_res["num_node"], train_res["num_relation"]
        inference_num_nodes, inference_num_rels = test_res["num_node"], test_res["num_relation"]

        train_edges, inf_graph, inf_valid_edges, inf_test_edges = train_res["triplets"], inference_res["triplets"], valid_res["triplets"], test_res["triplets"]

        train_target_edges = torch.tensor([[t[0], t[1]] for t in train_edges], dtype=torch.long).t()
        train_target_etypes = torch.tensor([t[2] for t in train_edges])

        train_fact_index = torch.cat([train_target_edges, train_target_edges.flip(0)], dim=1)
        train_fact_type = torch.cat([train_target_etypes, train_target_etypes + num_train_rels])

        inf_edges = torch.tensor([[t[0], t[1]] for t in inf_graph], dtype=torch.long).t()
        inf_edges = torch.cat([inf_edges, inf_edges.flip(0)], dim=1)
        inf_etypes = torch.tensor([t[2] for t in inf_graph])
        inf_etypes = torch.cat([inf_etypes, inf_etypes + inference_num_rels])

        inf_valid_edges = torch.tensor(inf_valid_edges, dtype=torch.long)
        inf_test_edges = torch.tensor(inf_test_edges, dtype=torch.long)

        train_data = Data(edge_index=train_fact_index, edge_type=train_fact_type, num_nodes=num_train_nodes,
                          target_edge_index=train_target_edges, target_edge_type=train_target_etypes, num_relations=num_train_rels*2)
        valid_data = Data(edge_index=train_fact_index,
                          edge_type=train_fact_type,
                          num_nodes=valid_res["num_node"],  # the only fix in this function
                          target_edge_index=inf_valid_edges[:, :2].T,
                          target_edge_type=inf_valid_edges[:, 2],
                          num_relations=inference_num_rels*2 if self.valid_on_inf else num_train_rels*2)
        test_data = Data(edge_index=inf_edges, edge_type=inf_etypes, num_nodes=inference_num_nodes,
                         target_edge_index=inf_test_edges[:, :2].T, target_edge_type=inf_test_edges[:, 2], num_relations=inference_num_rels*2)

        if self.pre_transform is not None:
            train_data = self.pre_transform(train_data)
            valid_data = self.pre_transform(valid_data)
            test_data = self.pre_transform(test_data)

        torch.save((self.collate([train_data, valid_data, test_data])), self.processed_paths[0])
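The valid_on_inf switch is easy to miss, so here is a minimal sketch, with hypothetical vocabularies, of the branching it controls in process(): validation triples are indexed against the training vocab when it is False (the HM case) and against the inference-graph vocab otherwise.

train_vocab = {"alice": 0, "bob": 1}          # entities seen in the training graph
inference_vocab = {"carol": 0, "dave": 1}     # entities of the disjoint inference graph

valid_on_inf = False                          # the HM setting
vocab_for_valid = inference_vocab if valid_on_inf else train_vocab
assert vocab_for_valid is train_vocab         # HM validation triples reuse the train vocab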
class MTDEAInductive(InductiveDataset):

    valid_on_inf = False
    url = "https://reltrans.s3.us-east-2.amazonaws.com/MTDEA_data.zip"
    base_name = "mtdea"

    def __init__(self, root, version, **kwargs):

        assert version in self.versions, f"unknown version {version} for {self.name}, available: {self.versions}"
        super().__init__(root, version, **kwargs)

    @property
    def raw_dir(self):
        return os.path.join(self.root, self.base_name, self.name, self.version, "raw")

    @property
    def processed_dir(self):
        return os.path.join(self.root, self.base_name, self.name, self.version, "processed")

    @property
    def raw_file_names(self):
        return [
            "transductive_train.txt", "inference_graph.txt", "transductive_valid.txt", "inf_test.txt"
        ]

    def download(self):
        base_path = os.path.join(self.root, self.base_name)
        download_path = download_url(self.url, base_path)
        extract_zip(download_path, base_path)
        # unzip all datasets at once
        for dsname in ['FBNELL', 'Metafam', 'WikiTopics-MT1', 'WikiTopics-MT2', 'WikiTopics-MT3', 'WikiTopics-MT4']:
            cl = globals()[dsname.replace("-", "")]
            versions = cl.versions
            for version in versions:
                for oldname, newname in zip(['train.txt', 'observe.txt', 'valid.txt', 'test.txt'], self.raw_file_names):
                    foldername = cl.prefix % version + "-trans" if "transductive" in newname else cl.prefix % version + "-ind"
                    os.renames(
                        os.path.join(base_path, "MTDEA_datasets", dsname, foldername, oldname),
                        os.path.join(base_path, dsname, version, "raw", newname)
                    )
        shutil.rmtree(os.path.join(base_path, "MTDEA_datasets"))

    def load_file(self, triplet_file, inv_entity_vocab={}, inv_rel_vocab={}, limit_vocab=False):

        triplets = []
        entity_cnt, rel_cnt = len(inv_entity_vocab), len(inv_rel_vocab)

        # limit_vocab drops triples whose head/tail/relation is absent from the given vocab
        # used for FBNELL and MT3:art; the other datasets seem to be ok and share num_nodes/num_relations between the train/inference graphs
        with open(triplet_file, "r", encoding="utf-8") as fin:
            for l in fin:
                u, r, v = l.split() if self.delimiter is None else l.strip().split(self.delimiter)
                if u not in inv_entity_vocab:
                    if limit_vocab:
                        continue
                    inv_entity_vocab[u] = entity_cnt
                    entity_cnt += 1
                if v not in inv_entity_vocab:
                    if limit_vocab:
                        continue
                    inv_entity_vocab[v] = entity_cnt
                    entity_cnt += 1
                if r not in inv_rel_vocab:
                    if limit_vocab:
                        continue
                    inv_rel_vocab[r] = rel_cnt
                    rel_cnt += 1
                u, r, v = inv_entity_vocab[u], inv_rel_vocab[r], inv_entity_vocab[v]

                triplets.append((u, v, r))

        return {
            "triplets": triplets,
            "num_node": entity_cnt,
            "num_relation": rel_cnt,
            "inv_entity_vocab": inv_entity_vocab,
            "inv_rel_vocab": inv_rel_vocab
        }

    # special processing for MTDEA datasets with one particular fix in the validation set loading
    def process(self):

        train_files = self.raw_paths[:4]

        train_res = self.load_file(train_files[0], inv_entity_vocab={}, inv_rel_vocab={})
        inference_res = self.load_file(train_files[1], inv_entity_vocab={}, inv_rel_vocab={})
        valid_res = self.load_file(
            train_files[2],
            inference_res["inv_entity_vocab"] if self.valid_on_inf else train_res["inv_entity_vocab"],
            inference_res["inv_rel_vocab"] if self.valid_on_inf else train_res["inv_rel_vocab"],
            limit_vocab=True,  # the 1st fix in this function compared to the superclass processor
        )
        test_res = self.load_file(train_files[3], inference_res["inv_entity_vocab"], inference_res["inv_rel_vocab"])

        num_train_nodes, num_train_rels = train_res["num_node"], train_res["num_relation"]
        inference_num_nodes, inference_num_rels = test_res["num_node"], test_res["num_relation"]

        train_edges, inf_graph, inf_valid_edges, inf_test_edges = train_res["triplets"], inference_res["triplets"], valid_res["triplets"], test_res["triplets"]

        train_target_edges = torch.tensor([[t[0], t[1]] for t in train_edges], dtype=torch.long).t()
        train_target_etypes = torch.tensor([t[2] for t in train_edges])

        train_fact_index = torch.cat([train_target_edges, train_target_edges.flip(0)], dim=1)
        train_fact_type = torch.cat([train_target_etypes, train_target_etypes + num_train_rels])

        inf_edges = torch.tensor([[t[0], t[1]] for t in inf_graph], dtype=torch.long).t()
        inf_edges = torch.cat([inf_edges, inf_edges.flip(0)], dim=1)
        inf_etypes = torch.tensor([t[2] for t in inf_graph])
        inf_etypes = torch.cat([inf_etypes, inf_etypes + inference_num_rels])

        inf_valid_edges = torch.tensor(inf_valid_edges, dtype=torch.long)
        inf_test_edges = torch.tensor(inf_test_edges, dtype=torch.long)

        train_data = Data(edge_index=train_fact_index, edge_type=train_fact_type, num_nodes=num_train_nodes,
                          target_edge_index=train_target_edges, target_edge_type=train_target_etypes, num_relations=num_train_rels*2)
        valid_data = Data(edge_index=train_fact_index,
                          edge_type=train_fact_type,
                          num_nodes=valid_res["num_node"],  # the 2nd fix in this function
                          target_edge_index=inf_valid_edges[:, :2].T,
                          target_edge_type=inf_valid_edges[:, 2],
                          num_relations=inference_num_rels*2 if self.valid_on_inf else num_train_rels*2)
        test_data = Data(edge_index=inf_edges, edge_type=inf_etypes, num_nodes=inference_num_nodes,
                         target_edge_index=inf_test_edges[:, :2].T, target_edge_type=inf_test_edges[:, 2], num_relations=inference_num_rels*2)

        if self.pre_transform is not None:
            train_data = self.pre_transform(train_data)
            valid_data = self.pre_transform(valid_data)
            test_data = self.pre_transform(test_data)

        torch.save((self.collate([train_data, valid_data, test_data])), self.processed_paths[0])


class FBNELL(MTDEAInductive):

    name = "FBNELL"
    prefix = "%s"
    versions = ["FBNELL_v1"]

    def __init__(self, **kwargs):
        kwargs.pop("version")
        kwargs['version'] = self.versions[0]
        super(FBNELL, self).__init__(**kwargs)


class Metafam(MTDEAInductive):

    name = "Metafam"
    prefix = "%s"
    versions = ["Metafam"]

    def __init__(self, **kwargs):
        kwargs.pop("version")
        kwargs['version'] = self.versions[0]
        super(Metafam, self).__init__(**kwargs)


class WikiTopicsMT1(MTDEAInductive):

    name = "WikiTopics-MT1"
    prefix = "wikidata_%sv1"
    versions = ['mt', 'health', 'tax']

    def __init__(self, **kwargs):
        assert kwargs['version'] in self.versions, f"unknown version {kwargs['version']}, available: {self.versions}"
        super(WikiTopicsMT1, self).__init__(**kwargs)


class WikiTopicsMT2(MTDEAInductive):

    name = "WikiTopics-MT2"
    prefix = "wikidata_%sv1"
    versions = ['mt2', 'org', 'sci']

    def __init__(self, **kwargs):
        super(WikiTopicsMT2, self).__init__(**kwargs)


class WikiTopicsMT3(MTDEAInductive):

    name = "WikiTopics-MT3"
    prefix = "wikidata_%sv2"
    versions = ['mt3', 'art', 'infra']

    def __init__(self, **kwargs):
        super(WikiTopicsMT3, self).__init__(**kwargs)


class WikiTopicsMT4(MTDEAInductive):

    name = "WikiTopics-MT4"
    prefix = "wikidata_%sv2"
    versions = ['mt4', 'sci', 'health']

    def __init__(self, **kwargs):
        super(WikiTopicsMT4, self).__init__(**kwargs)
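A toy illustration of the limit_vocab behaviour described above, using made-up entity names: triples mentioning anything outside the supplied vocab are dropped instead of growing the vocab.

vocab = {"a": 0, "b": 1}
raw = [("a", "r0", "b"), ("a", "r0", "zzz")]   # the second triple has an unseen tail

kept = [(h, r, t) for (h, r, t) in raw if h in vocab and t in vocab]
assert kept == [("a", "r0", "b")]              # dropped rather than added to the vocab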
# a joint dataset for pre-training ULTRA on several graphs
class JointDataset(InMemoryDataset):

    datasets_map = {
        'FB15k237': FB15k237,
        'WN18RR': WN18RR,
        'CoDExSmall': CoDExSmall,
        'CoDExMedium': CoDExMedium,
        'CoDExLarge': CoDExLarge,
        'NELL995': NELL995,
        'ConceptNet100k': ConceptNet100k,
        'DBpedia100k': DBpedia100k,
        'YAGO310': YAGO310,
        'AristoV4': AristoV4,
    }

    def __init__(self, root, graphs, transform=None, pre_transform=None):

        self.graphs = [self.datasets_map[ds](root=root) for ds in graphs]
        self.num_graphs = len(graphs)
        super().__init__(root, transform, pre_transform)
        self.data = torch.load(self.processed_paths[0])

    @property
    def raw_dir(self):
        return os.path.join(self.root, "joint", f'{self.num_graphs}g', "raw")

    @property
    def processed_dir(self):
        return os.path.join(self.root, "joint", f'{self.num_graphs}g', "processed")

    @property
    def processed_file_names(self):
        return "data.pt"

    def process(self):

        train_data = [g[0] for g in self.graphs]
        valid_data = [g[1] for g in self.graphs]
        test_data = [g[2] for g in self.graphs]
        # filter_data = [
        #     Data(edge_index=g.data.target_edge_index, edge_type=g.data.target_edge_type, num_nodes=g[0].num_nodes) for g in self.graphs
        # ]

        torch.save((train_data, valid_data, test_data), self.processed_paths[0])
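A hedged usage sketch for JointDataset; the mixture below is illustrative, not the released pre-training recipe, and it assumes the ultra package is importable.

from ultra.datasets import JointDataset

graphs = ["FB15k237", "WN18RR", "CoDExMedium"]             # any datasets_map keys work
assert all(g in JointDataset.datasets_map for g in graphs)

# instantiating downloads/processes every member graph on first use:
# dataset = JointDataset(root="./datasets/", graphs=graphs)
# dataset.data then holds the (train_list, valid_list, test_list) tuple saved in process()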
ultra/eval.py
ADDED
@@ -0,0 +1,153 @@
import math

import torch
from torch import distributed as dist
from torch.utils import data as torch_data
from torch_geometric.data import Data

from ultra import tasks, util


TRANSDUCTIVE = ("WordNet18RR", "RelLinkPredDataset", "CoDExSmall", "CoDExMedium", "CoDExLarge",
                "YAGO310", "NELL995", "ConceptNet100k", "DBpedia100k", "Hetionet", "AristoV4",
                "WDsinger", "NELL23k", "FB15k237_10", "FB15k237_20", "FB15k237_50")


def get_filtered_data(dataset, mode):
    train_data, valid_data, test_data = dataset[0], dataset[1], dataset[2]
    ds_name = dataset.__class__.__name__

    if ds_name in TRANSDUCTIVE:
        filtered_data = Data(edge_index=dataset._data.target_edge_index, edge_type=dataset._data.target_edge_type, num_nodes=dataset[0].num_nodes)
    else:
        if "ILPC" in ds_name or "Ingram" in ds_name:
            full_inference_edges = torch.cat([valid_data.edge_index, valid_data.target_edge_index, test_data.target_edge_index], dim=1)
            full_inference_etypes = torch.cat([valid_data.edge_type, valid_data.target_edge_type, test_data.target_edge_type])
            filtered_data = Data(edge_index=full_inference_edges, edge_type=full_inference_etypes, num_nodes=test_data.num_nodes)
        else:
            # test filtering graph: inference edges + test edges
            full_inference_edges = torch.cat([test_data.edge_index, test_data.target_edge_index], dim=1)
            full_inference_etypes = torch.cat([test_data.edge_type, test_data.target_edge_type])
            if mode == "test":
                filtered_data = Data(edge_index=full_inference_edges, edge_type=full_inference_etypes, num_nodes=test_data.num_nodes)
            else:
                # validation filtering graph: train edges + validation edges
                filtered_data = Data(
                    edge_index=torch.cat([train_data.edge_index, valid_data.target_edge_index], dim=1),
                    edge_type=torch.cat([train_data.edge_type, valid_data.target_edge_type])
                )

    return filtered_data


@torch.no_grad()
def test(model, mode, dataset, batch_size=32, eval_metrics=["mrr", "hits@10"], gpus=None, return_metrics=False):
    logger = util.get_root_logger()
    test_data = dataset[1] if mode == "valid" else dataset[2]
    filtered_data = get_filtered_data(dataset, mode)

    device = util.get_devices(gpus)
    world_size = util.get_world_size()
    rank = util.get_rank()

    test_triplets = torch.cat([test_data.target_edge_index, test_data.target_edge_type.unsqueeze(0)]).t()
    sampler = torch_data.DistributedSampler(test_triplets, world_size, rank)
    test_loader = torch_data.DataLoader(test_triplets, batch_size, sampler=sampler)

    model.eval()
    rankings = []
    num_negatives = []
    tail_rankings, num_tail_negs = [], []  # for the explicit tail-only evaluation needed for 5 datasets
    for batch in test_loader:
        t_batch, h_batch = tasks.all_negative(test_data, batch)
        t_pred = model(test_data, t_batch)
        h_pred = model(test_data, h_batch)

        if filtered_data is None:
            t_mask, h_mask = tasks.strict_negative_mask(test_data, batch)
        else:
            t_mask, h_mask = tasks.strict_negative_mask(filtered_data, batch)
        pos_h_index, pos_t_index, pos_r_index = batch.t()
        t_ranking = tasks.compute_ranking(t_pred, pos_t_index, t_mask)
        h_ranking = tasks.compute_ranking(h_pred, pos_h_index, h_mask)
        num_t_negative = t_mask.sum(dim=-1)
        num_h_negative = h_mask.sum(dim=-1)

        rankings += [t_ranking, h_ranking]
        num_negatives += [num_t_negative, num_h_negative]

        tail_rankings += [t_ranking]
        num_tail_negs += [num_t_negative]

    ranking = torch.cat(rankings)
    num_negative = torch.cat(num_negatives)
    all_size = torch.zeros(world_size, dtype=torch.long, device=device)
    all_size[rank] = len(ranking)

    # ugly repetitive code for tail-only ranks processing
    tail_ranking = torch.cat(tail_rankings)
    num_tail_neg = torch.cat(num_tail_negs)
    all_size_t = torch.zeros(world_size, dtype=torch.long, device=device)
    all_size_t[rank] = len(tail_ranking)
    if world_size > 1:
        dist.all_reduce(all_size, op=dist.ReduceOp.SUM)
        dist.all_reduce(all_size_t, op=dist.ReduceOp.SUM)

    # obtaining all ranks
    cum_size = all_size.cumsum(0)
    all_ranking = torch.zeros(all_size.sum(), dtype=torch.long, device=device)
    all_ranking[cum_size[rank] - all_size[rank]: cum_size[rank]] = ranking
    all_num_negative = torch.zeros(all_size.sum(), dtype=torch.long, device=device)
    all_num_negative[cum_size[rank] - all_size[rank]: cum_size[rank]] = num_negative

    # the same for tail-only ranks
    cum_size_t = all_size_t.cumsum(0)
    all_ranking_t = torch.zeros(all_size_t.sum(), dtype=torch.long, device=device)
    all_ranking_t[cum_size_t[rank] - all_size_t[rank]: cum_size_t[rank]] = tail_ranking
    all_num_negative_t = torch.zeros(all_size_t.sum(), dtype=torch.long, device=device)
    all_num_negative_t[cum_size_t[rank] - all_size_t[rank]: cum_size_t[rank]] = num_tail_neg
    if world_size > 1:
        dist.all_reduce(all_ranking, op=dist.ReduceOp.SUM)
        dist.all_reduce(all_num_negative, op=dist.ReduceOp.SUM)
        dist.all_reduce(all_ranking_t, op=dist.ReduceOp.SUM)
        dist.all_reduce(all_num_negative_t, op=dist.ReduceOp.SUM)

    metrics = {}
    if rank == 0:
        for metric in eval_metrics:
            if "-tail" in metric:
                _metric_name, direction = metric.split("-")
                if direction != "tail":
                    raise ValueError("Only tail metric is supported in this mode")
                _ranking = all_ranking_t
                _num_neg = all_num_negative_t
            else:
                _ranking = all_ranking
                _num_neg = all_num_negative
                _metric_name = metric

            if _metric_name == "mr":
                score = _ranking.float().mean()
            elif _metric_name == "mrr":
                score = (1 / _ranking.float()).mean()
            elif _metric_name.startswith("hits@"):
                values = _metric_name[5:].split("_")
                threshold = int(values[0])
                if len(values) > 1:
                    num_sample = int(values[1])
                    # unbiased estimation
                    fp_rate = (_ranking - 1).float() / _num_neg
                    score = 0
                    for i in range(threshold):
                        # choose i false positives from num_sample - 1 negatives
                        num_comb = math.factorial(num_sample - 1) / \
                                   math.factorial(i) / math.factorial(num_sample - i - 1)
                        score += num_comb * (fp_rate ** i) * ((1 - fp_rate) ** (num_sample - i - 1))
                    score = score.mean()
                else:
                    score = (_ranking <= threshold).float().mean()
            logger.warning("%s: %g" % (metric, score))
            metrics[metric] = score
    mrr = (1 / all_ranking.float()).mean()

    return mrr if not return_metrics else metrics
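The "hits@K_n" branch above deserves a worked example. Given a full ranking r among N filtered negatives, the false-positive rate is fp = (r - 1) / N, and the unbiased estimate of hits@K against n sampled negatives is the probability that fewer than K of the n - 1 sampled negatives outrank the positive, i.e. sum over i < K of C(n-1, i) * fp^i * (1-fp)^(n-1-i). A minimal numeric sketch mirroring that loop:

import math

def hits_at_k_unbiased(rank, num_negative, k, num_sample):
    # probability that fewer than k of (num_sample - 1) sampled negatives
    # outrank the positive, given its full-ranking false-positive rate
    fp = (rank - 1) / num_negative
    return sum(math.comb(num_sample - 1, i) * fp ** i * (1 - fp) ** (num_sample - 1 - i)
               for i in range(k))

# a positive ranked 5th among 10000 negatives, estimated as hits@10 over 50 samples
print(round(hits_at_k_unbiased(5, 10000, k=10, num_sample=50), 6))  # ~1.0, since fp is tiny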
ultra/layers.py
ADDED
@@ -0,0 +1,234 @@
import torch
from torch import nn
from torch.nn import functional as F
from torch_scatter import scatter

from torch_geometric.nn.conv import MessagePassing
from torch_geometric.utils import degree
from typing import Tuple


class GeneralizedRelationalConv(MessagePassing):

    eps = 1e-6

    message2mul = {
        "transe": "add",
        "distmult": "mul",
    }

    # TODO for compile() - doesn't work currently
    # propagate_type = {"edge_index": torch.LongTensor, "size": Tuple[int, int]}

    def __init__(self, input_dim, output_dim, num_relation, query_input_dim, message_func="distmult",
                 aggregate_func="pna", layer_norm=False, activation="relu", dependent=False, project_relations=False):
        super(GeneralizedRelationalConv, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_relation = num_relation
        self.query_input_dim = query_input_dim
        self.message_func = message_func
        self.aggregate_func = aggregate_func
        self.dependent = dependent
        self.project_relations = project_relations

        if layer_norm:
            self.layer_norm = nn.LayerNorm(output_dim)
        else:
            self.layer_norm = None
        if isinstance(activation, str):
            self.activation = getattr(F, activation)
        else:
            self.activation = activation

        if self.aggregate_func == "pna":
            self.linear = nn.Linear(input_dim * 13, output_dim)
        else:
            self.linear = nn.Linear(input_dim * 2, output_dim)

        if dependent:
            # obtain relation embeddings as a projection of the query relation
            self.relation_linear = nn.Linear(query_input_dim, num_relation * input_dim)
        else:
            if not self.project_relations:
                # relation embeddings as an independent embedding matrix per layer
                self.relation = nn.Embedding(num_relation, input_dim)
            else:
                # will be initialized after the pass over the relation graph
                self.relation = None
                self.relation_projection = nn.Sequential(
                    nn.Linear(input_dim, input_dim),
                    nn.ReLU(),
                    nn.Linear(input_dim, input_dim)
                )

    def forward(self, input, query, boundary, edge_index, edge_type, size, edge_weight=None):
        batch_size = len(query)

        if self.dependent:
            # layer-specific relation features as a projection of the input "query" (relation) embeddings
            relation = self.relation_linear(query).view(batch_size, self.num_relation, self.input_dim)
        else:
            if not self.project_relations:
                # layer-specific relation features as a special embedding matrix unique to each layer
                relation = self.relation.weight.expand(batch_size, -1, -1)
            else:
                # NEW and only change:
                # project relation features to features unique to this layer, then resize for the current batch
                relation = self.relation_projection(self.relation)
        if edge_weight is None:
            edge_weight = torch.ones(len(edge_type), device=input.device)

        # note that we send the initial boundary condition (node states at layer 0) to the message passing
        # corresponds to Eq. 6 on p. 5 in https://arxiv.org/pdf/2106.06935.pdf
        output = self.propagate(input=input, relation=relation, boundary=boundary, edge_index=edge_index,
                                edge_type=edge_type, size=size, edge_weight=edge_weight)
        return output

    def propagate(self, edge_index, size=None, **kwargs):
        if kwargs["edge_weight"].requires_grad or self.message_func == "rotate":
            # the rspmm cuda kernel only works for TransE and DistMult message functions
            # otherwise we invoke separate message & aggregate functions
            return super(GeneralizedRelationalConv, self).propagate(edge_index, size, **kwargs)

        for hook in self._propagate_forward_pre_hooks.values():
            res = hook(self, (edge_index, size, kwargs))
            if res is not None:
                edge_index, size, kwargs = res

        # in newer PyG,
        # __check_input__ -> _check_input()
        # __collect__ -> _collect()
        # __fused_user_args__ -> _fused_user_args
        size = self._check_input(edge_index, size)
        coll_dict = self._collect(self._fused_user_args, edge_index, size, kwargs)

        msg_aggr_kwargs = self.inspector.distribute("message_and_aggregate", coll_dict)
        for hook in self._message_and_aggregate_forward_pre_hooks.values():
            res = hook(self, (edge_index, msg_aggr_kwargs))
            if res is not None:
                edge_index, msg_aggr_kwargs = res
        out = self.message_and_aggregate(edge_index, **msg_aggr_kwargs)
        for hook in self._message_and_aggregate_forward_hooks.values():
            res = hook(self, (edge_index, msg_aggr_kwargs), out)
            if res is not None:
                out = res

        update_kwargs = self.inspector.distribute("update", coll_dict)
        out = self.update(out, **update_kwargs)

        for hook in self._propagate_forward_hooks.values():
            res = hook(self, (edge_index, size, kwargs), out)
            if res is not None:
                out = res

        return out

    def message(self, input_j, relation, boundary, edge_type):
        relation_j = relation.index_select(self.node_dim, edge_type)

        if self.message_func == "transe":
            message = input_j + relation_j
        elif self.message_func == "distmult":
            message = input_j * relation_j
        elif self.message_func == "rotate":
            x_j_re, x_j_im = input_j.chunk(2, dim=-1)
            r_j_re, r_j_im = relation_j.chunk(2, dim=-1)
            message_re = x_j_re * r_j_re - x_j_im * r_j_im
            message_im = x_j_re * r_j_im + x_j_im * r_j_re
            message = torch.cat([message_re, message_im], dim=-1)
        else:
            raise ValueError("Unknown message function `%s`" % self.message_func)

        # augment messages with the boundary condition
        message = torch.cat([message, boundary], dim=self.node_dim)  # (num_edges + num_nodes, batch_size, input_dim)

        return message

    def aggregate(self, input, edge_weight, index, dim_size):
        # augment the aggregation index with self-loops for the boundary condition
        index = torch.cat([index, torch.arange(dim_size, device=input.device)])  # (num_edges + num_nodes,)
        edge_weight = torch.cat([edge_weight, torch.ones(dim_size, device=input.device)])
        shape = [1] * input.ndim
        shape[self.node_dim] = -1
        edge_weight = edge_weight.view(shape)

        if self.aggregate_func == "pna":
            mean = scatter(input * edge_weight, index, dim=self.node_dim, dim_size=dim_size, reduce="mean")
            sq_mean = scatter(input ** 2 * edge_weight, index, dim=self.node_dim, dim_size=dim_size, reduce="mean")
            max = scatter(input * edge_weight, index, dim=self.node_dim, dim_size=dim_size, reduce="max")
            min = scatter(input * edge_weight, index, dim=self.node_dim, dim_size=dim_size, reduce="min")
            std = (sq_mean - mean ** 2).clamp(min=self.eps).sqrt()
            features = torch.cat([mean.unsqueeze(-1), max.unsqueeze(-1), min.unsqueeze(-1), std.unsqueeze(-1)], dim=-1)
            features = features.flatten(-2)
            degree_out = degree(index, dim_size).unsqueeze(0).unsqueeze(-1)
            scale = degree_out.log()
            scale = scale / scale.mean()
            scales = torch.cat([torch.ones_like(scale), scale, 1 / scale.clamp(min=1e-2)], dim=-1)
            output = (features.unsqueeze(-1) * scales.unsqueeze(-2)).flatten(-2)
        else:
            output = scatter(input * edge_weight, index, dim=self.node_dim, dim_size=dim_size,
                             reduce=self.aggregate_func)

        return output

    def message_and_aggregate(self, edge_index, input, relation, boundary, edge_type, edge_weight, index, dim_size):
        # fused computation of the message and aggregate steps with the custom rspmm cuda kernel
        # speeds up computation by several times
        # reduces memory complexity from O(|E|d) to O(|V|d), so we can apply it to larger graphs
        from ultra.rspmm.rspmm import generalized_rspmm

        batch_size, num_node = input.shape[:2]
        input = input.transpose(0, 1).flatten(1)
        relation = relation.transpose(0, 1).flatten(1)
        boundary = boundary.transpose(0, 1).flatten(1)
        degree_out = degree(index, dim_size).unsqueeze(-1) + 1

        if self.message_func in self.message2mul:
            mul = self.message2mul[self.message_func]
        else:
            raise ValueError("Unknown message function `%s`" % self.message_func)
        if self.aggregate_func == "sum":
            update = generalized_rspmm(edge_index, edge_type, edge_weight, relation, input, sum="add", mul=mul)
            update = update + boundary
        elif self.aggregate_func == "mean":
            update = generalized_rspmm(edge_index, edge_type, edge_weight, relation, input, sum="add", mul=mul)
            update = (update + boundary) / degree_out
        elif self.aggregate_func == "max":
            update = generalized_rspmm(edge_index, edge_type, edge_weight, relation, input, sum="max", mul=mul)
            update = torch.max(update, boundary)
        elif self.aggregate_func == "pna":
            # we use PNA with 4 aggregators (mean / max / min / std)
            # and 3 scalars (identity / log degree / reciprocal of log degree)
            sum = generalized_rspmm(edge_index, edge_type, edge_weight, relation, input, sum="add", mul=mul)
            sq_sum = generalized_rspmm(edge_index, edge_type, edge_weight, relation ** 2, input ** 2, sum="add",
                                       mul=mul)
            max = generalized_rspmm(edge_index, edge_type, edge_weight, relation, input, sum="max", mul=mul)
            min = generalized_rspmm(edge_index, edge_type, edge_weight, relation, input, sum="min", mul=mul)
            mean = (sum + boundary) / degree_out
            sq_mean = (sq_sum + boundary ** 2) / degree_out
            max = torch.max(max, boundary)
            min = torch.min(min, boundary)  # (node, batch_size * input_dim)
            std = (sq_mean - mean ** 2).clamp(min=self.eps).sqrt()
            features = torch.cat([mean.unsqueeze(-1), max.unsqueeze(-1), min.unsqueeze(-1), std.unsqueeze(-1)], dim=-1)
            features = features.flatten(-2)  # (node, batch_size * input_dim * 4)
            scale = degree_out.log()
            scale = scale / scale.mean()
            scales = torch.cat([torch.ones_like(scale), scale, 1 / scale.clamp(min=1e-2)], dim=-1)  # (node, 3)
            update = (features.unsqueeze(-1) * scales.unsqueeze(-2)).flatten(-2)  # (node, batch_size * input_dim * 4 * 3)
        else:
            raise ValueError("Unknown aggregation function `%s`" % self.aggregate_func)

        update = update.view(num_node, batch_size, -1).transpose(0, 1)
        return update

    def update(self, update, input):
        # node update as a function of the old states (input) and this layer's output (update)
        output = self.linear(torch.cat([input, update], dim=-1))
        if self.layer_norm:
            output = self.layer_norm(output)
        if self.activation:
            output = self.activation(output)
        return output
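A toy shape check for the PNA branch above: 4 aggregators (mean / max / min / std) times 3 degree scalers expand a d-dimensional node state to 12d, and update() then concatenates the old d-dimensional state, which is why the pna case uses nn.Linear(input_dim * 13, output_dim).

import torch

d, num_nodes = 8, 5
features = torch.randn(num_nodes, d * 4)   # mean / max / min / std, concatenated
scales = torch.randn(num_nodes, 3)         # identity, log degree, 1 / log degree
pna_out = (features.unsqueeze(-1) * scales.unsqueeze(-2)).flatten(-2)
assert pna_out.shape == (num_nodes, d * 12)   # update() concatenates the old d-dim state -> 13d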
ultra/models.py
ADDED
@@ -0,0 +1,214 @@
import torch
from torch import nn

from . import tasks, layers
from ultra.base_nbfnet import BaseNBFNet


class Ultra(nn.Module):

    def __init__(self, rel_model_cfg, entity_model_cfg):
        # kept that because super Ultra sounds cool
        super(Ultra, self).__init__()

        self.relation_model = RelNBFNet(**rel_model_cfg)
        self.entity_model = EntityNBFNet(**entity_model_cfg)

    def forward(self, data, batch):

        # batch shape: (bs, 1+num_negs, 3)
        # relations are the same across all positive and negative triples, so we can extract just one from the first triple among the 1+num_negs
        query_rels = batch[:, 0, 2]
        relation_representations = self.relation_model(data.relation_graph, query=query_rels)
        score = self.entity_model(data, relation_representations, batch)

        return score


# NBFNet that works on the graph of relations with 4 fundamental interactions
# Doesn't have the final projection MLP from hidden dim -> 1; returns all node representations
# of shape [bs, num_rel, hidden]
class RelNBFNet(BaseNBFNet):

    def __init__(self, input_dim, hidden_dims, num_relation=4, **kwargs):
        super().__init__(input_dim, hidden_dims, num_relation, **kwargs)

        self.layers = nn.ModuleList()
        for i in range(len(self.dims) - 1):
            self.layers.append(
                layers.GeneralizedRelationalConv(
                    self.dims[i], self.dims[i + 1], num_relation,
                    self.dims[0], self.message_func, self.aggregate_func, self.layer_norm,
                    self.activation, dependent=False)
            )

        if self.concat_hidden:
            feature_dim = sum(hidden_dims) + input_dim
            self.mlp = nn.Sequential(
                nn.Linear(feature_dim, feature_dim),
                nn.ReLU(),
                nn.Linear(feature_dim, input_dim)
            )

    def bellmanford(self, data, h_index, separate_grad=False):
        batch_size = len(h_index)

        # initialize the source nodes (the relations of interest in the batch) with all ones
        query = torch.ones(h_index.shape[0], self.dims[0], device=h_index.device, dtype=torch.float)
        index = h_index.unsqueeze(-1).expand_as(query)

        # initial (boundary) condition - initialize all node states as zeros
        boundary = torch.zeros(batch_size, data.num_nodes, self.dims[0], device=h_index.device)
        # boundary = torch.zeros(data.num_nodes, *query.shape, device=h_index.device)
        # indicator function: the scatter operation puts ones as the init features of the source (index) nodes
        boundary.scatter_add_(1, index.unsqueeze(1), query.unsqueeze(1))
        size = (data.num_nodes, data.num_nodes)
        edge_weight = torch.ones(data.num_edges, device=h_index.device)

        hiddens = []
        edge_weights = []
        layer_input = boundary

        for layer in self.layers:
            # Bellman-Ford iteration; we send the original boundary condition in addition to the updated node states
            hidden = layer(layer_input, query, boundary, data.edge_index, data.edge_type, size, edge_weight)
            if self.short_cut and hidden.shape == layer_input.shape:
                # residual connection here
                hidden = hidden + layer_input
            hiddens.append(hidden)
            edge_weights.append(edge_weight)
            layer_input = hidden

        # original query (relation type) embeddings
        node_query = query.unsqueeze(1).expand(-1, data.num_nodes, -1)  # (batch_size, num_nodes, input_dim)
        if self.concat_hidden:
            output = torch.cat(hiddens + [node_query], dim=-1)
            output = self.mlp(output)
        else:
            output = hiddens[-1]

        return {
            "node_feature": output,
            "edge_weights": edge_weights,
        }

    def forward(self, rel_graph, query):

        # message passing and updated node representations (that are in fact relations)
        output = self.bellmanford(rel_graph, h_index=query)["node_feature"]  # (batch_size, num_nodes, hidden_dim)

        return output


class EntityNBFNet(BaseNBFNet):

    def __init__(self, input_dim, hidden_dims, num_relation=1, **kwargs):

        # dummy num_relation = 1 as we won't use it in the NBFNet layer
        super().__init__(input_dim, hidden_dims, num_relation, **kwargs)

        self.layers = nn.ModuleList()
        for i in range(len(self.dims) - 1):
            self.layers.append(
                layers.GeneralizedRelationalConv(
                    self.dims[i], self.dims[i + 1], num_relation,
                    self.dims[0], self.message_func, self.aggregate_func, self.layer_norm,
                    self.activation, dependent=False, project_relations=True)
            )

        feature_dim = (sum(hidden_dims) if self.concat_hidden else hidden_dims[-1]) + input_dim
        self.mlp = nn.Sequential()
        mlp = []
        for i in range(self.num_mlp_layers - 1):
            mlp.append(nn.Linear(feature_dim, feature_dim))
            mlp.append(nn.ReLU())
        mlp.append(nn.Linear(feature_dim, 1))
        self.mlp = nn.Sequential(*mlp)

    def bellmanford(self, data, h_index, r_index, separate_grad=False):
        batch_size = len(r_index)

        # initialize the queries (relation types of the given triples)
        query = self.query[torch.arange(batch_size, device=r_index.device), r_index]
        index = h_index.unsqueeze(-1).expand_as(query)

        # initial (boundary) condition - initialize all node states as zeros
        boundary = torch.zeros(batch_size, data.num_nodes, self.dims[0], device=h_index.device)
        # the scatter operation puts query (relation) embeddings as the init features of the source (index) nodes
        boundary.scatter_add_(1, index.unsqueeze(1), query.unsqueeze(1))

        size = (data.num_nodes, data.num_nodes)
        edge_weight = torch.ones(data.num_edges, device=h_index.device)

        hiddens = []
        edge_weights = []
        layer_input = boundary

        for layer in self.layers:

            # for visualization
            if separate_grad:
                edge_weight = edge_weight.clone().requires_grad_()

            # Bellman-Ford iteration; we send the original boundary condition in addition to the updated node states
            hidden = layer(layer_input, query, boundary, data.edge_index, data.edge_type, size, edge_weight)
            if self.short_cut and hidden.shape == layer_input.shape:
                # residual connection here
                hidden = hidden + layer_input
            hiddens.append(hidden)
            edge_weights.append(edge_weight)
            layer_input = hidden

        # original query (relation type) embeddings
        node_query = query.unsqueeze(1).expand(-1, data.num_nodes, -1)  # (batch_size, num_nodes, input_dim)
        if self.concat_hidden:
            output = torch.cat(hiddens + [node_query], dim=-1)
        else:
            output = torch.cat([hiddens[-1], node_query], dim=-1)

        return {
            "node_feature": output,
            "edge_weights": edge_weights,
        }

    def forward(self, data, relation_representations, batch):
        h_index, t_index, r_index = batch.unbind(-1)

        # initial query representations are those from the relation graph
        self.query = relation_representations

        # initialize relations in each NBFNet layer (with a unique projection internally)
        for layer in self.layers:
            layer.relation = relation_representations

        if self.training:
            # edge dropout in the training mode
            # here we want to remove immediate edges (head, relation, tail) from the edge_index and edge_types
            # to make the NBFNet iteration learn non-trivial paths
            data = self.remove_easy_edges(data, h_index, t_index, r_index)

        shape = h_index.shape
        # turn all triples in a batch into tail prediction mode
        h_index, t_index, r_index = self.negative_sample_to_tail(h_index, t_index, r_index, num_direct_rel=data.num_relations // 2)
        assert (h_index[:, [0]] == h_index).all()
        assert (r_index[:, [0]] == r_index).all()

        # message passing and updated node representations
        output = self.bellmanford(data, h_index[:, 0], r_index[:, 0])  # (batch_size, num_nodes, feature_dim)
        feature = output["node_feature"]
        index = t_index.unsqueeze(-1).expand(-1, -1, feature.shape[-1])
        # extract representations of the tail entities from the updated node states
        feature = feature.gather(1, index)  # (batch_size, num_negative + 1, feature_dim)

        # probability logit for each tail node in the batch
        # (batch_size, num_negative + 1, dim) -> (batch_size, num_negative + 1)
        score = self.mlp(feature).squeeze(-1)
        return score.view(shape)
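A minimal sketch of the batch layout Ultra.forward expects, with hypothetical entity/relation ids: (bs, 1 + num_negs, 3) triples in (head, tail, relation) order, every row sharing one query relation.

import torch

bs, num_negs = 2, 3
pos = torch.tensor([[0, 5, 7], [3, 1, 2]])              # (bs, 3) positive (h, t, r) triples
negs = pos.unsqueeze(1).repeat(1, num_negs, 1)          # copy each positive num_negs times
negs[:, :, 1] = torch.randint(0, 10, (bs, num_negs))    # corrupt only the tails
batch = torch.cat([pos.unsqueeze(1), negs], dim=1)      # (bs, 1 + num_negs, 3)

query_rels = batch[:, 0, 2]                             # the extraction done in Ultra.forward
assert (batch[:, :, 2] == query_rels.unsqueeze(1)).all()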
ultra/rspmm/rspmm.py
ADDED
@@ -0,0 +1,204 @@
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
|
4 |
+
import torch.backends.openmp
|
5 |
+
from torch import autograd
|
6 |
+
from torch.utils import cpp_extension
|
7 |
+
|
8 |
+
module = sys.modules[__name__]
|
9 |
+
|
10 |
+
|
11 |
+
class RSPMMAddMulFunction(autograd.Function):
|
12 |
+
|
13 |
+
@staticmethod
|
14 |
+
def forward(ctx, edge_index, edge_type, edge_weight, relation, input):
|
15 |
+
node_in, node_out = edge_index
|
16 |
+
key = node_in * (node_out.max() + 1) + node_out
|
17 |
+
assert (key.diff() >= 0).all(), "Expect sorted `edge_index`"
|
18 |
+
|
19 |
+
if input.device.type == "cuda":
|
20 |
+
forward = rspmm.rspmm_add_mul_forward_cuda
|
21 |
+
else:
|
22 |
+
forward = rspmm.rspmm_add_mul_forward_cpu
|
23 |
+
output = forward(edge_index, edge_type, edge_weight, relation, input)
|
24 |
+
ctx.save_for_backward(edge_index, edge_type, edge_weight, relation, input, output)
|
25 |
+
return output
|
26 |
+
|
27 |
+
@staticmethod
|
28 |
+
def backward(ctx, output_grad):
|
29 |
+
if output_grad.device.type == "cuda":
|
30 |
+
backward = rspmm.rspmm_add_mul_backward_cuda
|
31 |
+
else:
|
32 |
+
backward = rspmm.rspmm_add_mul_backward_cpu
|
33 |
+
weight_grad, relation_grad, input_grad = backward(*ctx.saved_tensors, output_grad)
|
34 |
+
return None, None, weight_grad, relation_grad, input_grad
|
35 |
+
|
36 |
+
|
37 |
+
class RSPMMMinMulFunction(autograd.Function):
|
38 |
+
|
39 |
+
@staticmethod
|
40 |
+
def forward(ctx, edge_index, edge_type, edge_weight, relation, input):
|
41 |
+
node_in, node_out = edge_index
|
42 |
+
key = node_in * (node_out.max() + 1) + node_out
|
43 |
+
assert (key.diff() >= 0).all(), "Expect sorted `edge_index`"
|
44 |
+
|
45 |
+
if input.device.type == "cuda":
|
46 |
+
forward = rspmm.rspmm_min_mul_forward_cuda
|
47 |
+
else:
|
48 |
+
forward = rspmm.rspmm_min_mul_forward_cpu
|
49 |
+
output = forward(edge_index, edge_type, edge_weight, relation, input)
|
50 |
+
ctx.save_for_backward(edge_index, edge_type, edge_weight, relation, input, output)
|
51 |
+
return output
|
52 |
+
|
53 |
+
@staticmethod
|
54 |
+
def backward(ctx, output_grad):
|
55 |
+
if output_grad.device.type == "cuda":
|
56 |
+
            backward = rspmm.rspmm_min_mul_backward_cuda
        else:
            backward = rspmm.rspmm_min_mul_backward_cpu
        weight_grad, relation_grad, input_grad = backward(*ctx.saved_tensors, output_grad)
        return None, None, weight_grad, relation_grad, input_grad


class RSPMMMaxMulFunction(autograd.Function):

    @staticmethod
    def forward(ctx, edge_index, edge_type, edge_weight, relation, input):
        node_in, node_out = edge_index
        key = node_in * (node_out.max() + 1) + node_out
        assert (key.diff() >= 0).all(), "Expect sorted `edge_index`"

        if input.device.type == "cuda":
            forward = rspmm.rspmm_max_mul_forward_cuda
        else:
            forward = rspmm.rspmm_max_mul_forward_cpu
        output = forward(edge_index, edge_type, edge_weight, relation, input)
        ctx.save_for_backward(edge_index, edge_type, edge_weight, relation, input, output)
        return output

    @staticmethod
    def backward(ctx, output_grad):
        if output_grad.device.type == "cuda":
            backward = rspmm.rspmm_max_mul_backward_cuda
        else:
            backward = rspmm.rspmm_max_mul_backward_cpu
        weight_grad, relation_grad, input_grad = backward(*ctx.saved_tensors, output_grad)
        return None, None, weight_grad, relation_grad, input_grad


class RSPMMAddAddFunction(autograd.Function):

    @staticmethod
    def forward(ctx, edge_index, edge_type, edge_weight, relation, input):
        node_in, node_out = edge_index
        key = node_in * (node_out.max() + 1) + node_out
        assert (key.diff() >= 0).all(), "Expect sorted `edge_index`"

        if input.device.type == "cuda":
            forward = rspmm.rspmm_add_add_forward_cuda
        else:
            forward = rspmm.rspmm_add_add_forward_cpu
        output = forward(edge_index, edge_type, edge_weight, relation, input)
        ctx.save_for_backward(edge_index, edge_type, edge_weight, relation, input, output)
        return output

    @staticmethod
    def backward(ctx, output_grad):
        if output_grad.device.type == "cuda":
            backward = rspmm.rspmm_add_add_backward_cuda
        else:
            backward = rspmm.rspmm_add_add_backward_cpu
        weight_grad, relation_grad, input_grad = backward(*ctx.saved_tensors, output_grad)
        return None, None, weight_grad, relation_grad, input_grad


class RSPMMMinAddFunction(autograd.Function):

    @staticmethod
    def forward(ctx, edge_index, edge_type, edge_weight, relation, input):
        node_in, node_out = edge_index
        key = node_in * (node_out.max() + 1) + node_out
        assert (key.diff() >= 0).all(), "Expect sorted `edge_index`"

        if input.device.type == "cuda":
            forward = rspmm.rspmm_min_add_forward_cuda
        else:
            forward = rspmm.rspmm_min_add_forward_cpu
        output = forward(edge_index, edge_type, edge_weight, relation, input)
        ctx.save_for_backward(edge_index, edge_type, edge_weight, relation, input, output)
        return output

    @staticmethod
    def backward(ctx, output_grad):
        if output_grad.device.type == "cuda":
            backward = rspmm.rspmm_min_add_backward_cuda
        else:
            backward = rspmm.rspmm_min_add_backward_cpu
        weight_grad, relation_grad, input_grad = backward(*ctx.saved_tensors, output_grad)
        return None, None, weight_grad, relation_grad, input_grad


class RSPMMMaxAddFunction(autograd.Function):

    @staticmethod
    def forward(ctx, edge_index, edge_type, edge_weight, relation, input):
        node_in, node_out = edge_index
        key = node_in * (node_out.max() + 1) + node_out
        assert (key.diff() >= 0).all(), "Expect sorted `edge_index`"

        if input.device.type == "cuda":
            forward = rspmm.rspmm_max_add_forward_cuda
        else:
            forward = rspmm.rspmm_max_add_forward_cpu
        output = forward(edge_index, edge_type, edge_weight, relation, input)
        ctx.save_for_backward(edge_index, edge_type, edge_weight, relation, input, output)
        return output

    @staticmethod
    def backward(ctx, output_grad):
        if output_grad.device.type == "cuda":
            backward = rspmm.rspmm_max_add_backward_cuda
        else:
            backward = rspmm.rspmm_max_add_backward_cpu
        weight_grad, relation_grad, input_grad = backward(*ctx.saved_tensors, output_grad)
        return None, None, weight_grad, relation_grad, input_grad


def generalized_rspmm(edge_index, edge_type, edge_weight, relation, input, sum="add", mul="mul"):
    name = "RSPMM%s%sFunction" % (sum.capitalize(), mul.capitalize())
    if not hasattr(module, name):
        raise ValueError("No generalized rspmm implementation found for summation `%s` and multiplication `%s`"
                         % (sum, mul))
    Function = getattr(module, name)

    node_in, node_out = edge_index
    key = node_in * (node_out.max() + 1) + node_out
    order = key.argsort()

    return Function.apply(edge_index[:, order], edge_type[order], edge_weight[order], relation, input)


def load_extension(name, sources, extra_cflags=None, extra_cuda_cflags=None, **kwargs):
    if extra_cflags is None:
        extra_cflags = ["-Ofast"]
        if torch.backends.openmp.is_available():
            extra_cflags += ["-fopenmp", "-DAT_PARALLEL_OPENMP"]
        else:
            extra_cflags.append("-DAT_PARALLEL_NATIVE")
    if extra_cuda_cflags is None:
        if torch.cuda.is_available():
            extra_cuda_cflags = ["-O3"]
            extra_cflags.append("-DCUDA_OP")
        else:
            new_sources = []
            for source in sources:
                if not cpp_extension._is_cuda_file(source):
                    new_sources.append(source)
            sources = new_sources

    return cpp_extension.load(name, sources, extra_cflags, extra_cuda_cflags, **kwargs)


print("Loading rspmm extension. This may take a while...")
path = os.path.join(os.path.dirname(__file__), "source")
rspmm = load_extension("rspmm", [os.path.join(path, "rspmm.cpp"), os.path.join(path, "rspmm.cu")])
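As a sanity check on the semantics of `generalized_rspmm`, the following is a minimal pure-Python reference for the default `sum="add", mul="mul"` combination. This is an illustrative sketch only (the helper name `rspmm_reference` is hypothetical, not part of the extension); `generalized_rspmm` sorts the edges itself before dispatching to the compiled kernels, so the two should agree up to floating-point error.

import torch

def rspmm_reference(edge_index, edge_type, edge_weight, relation, input):
    # dense reference: output[i] = sum over edges (i -> j) of type r of w * (relation[r] * input[j])
    num_node, dim = input.shape
    output = torch.zeros(num_node, dim)
    for (i, j), r, w in zip(edge_index.t().tolist(), edge_type.tolist(), edge_weight.tolist()):
        output[i] += w * relation[r] * input[j]
    return output

edge_index = torch.tensor([[0, 0, 1], [1, 2, 2]])
edge_type = torch.tensor([0, 1, 0])
edge_weight = torch.rand(3)
relation = torch.rand(2, 4)
input = torch.rand(3, 4)
expected = rspmm_reference(edge_index, edge_type, edge_weight, relation, input)
# torch.allclose(generalized_rspmm(edge_index, edge_type, edge_weight, relation, input), expected)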
ultra/rspmm/source/operator.cuh
ADDED
@@ -0,0 +1,82 @@
#pragma once

#include <limits>

#ifdef __CUDA_ARCH__
#define HOST_DEVICE __host__ __device__
#else
#define HOST_DEVICE
#endif

namespace at {

template <class scalar_t>
struct BinaryAdd {
    HOST_DEVICE static scalar_t forward(scalar_t x, scalar_t y) {
        return x + y;
    }

    HOST_DEVICE static scalar_t backward_lhs(scalar_t x, scalar_t y) {
        return 1;
    }

    HOST_DEVICE static scalar_t backward_rhs(scalar_t x, scalar_t y) {
        return 1;
    }
};

template <class scalar_t>
struct BinaryMul {
    HOST_DEVICE static scalar_t forward(scalar_t x, scalar_t y) {
        return x * y;
    }

    HOST_DEVICE static scalar_t backward_lhs(scalar_t x, scalar_t y) {
        return y;
    }

    HOST_DEVICE static scalar_t backward_rhs(scalar_t x, scalar_t y) {
        return x;
    }
};

template <class scalar_t>
struct NaryAdd {
    HOST_DEVICE static scalar_t forward(scalar_t result, scalar_t x) {
        return result + x;
    }

    HOST_DEVICE static scalar_t backward(scalar_t result, scalar_t x) {
        return 1;
    }

    static constexpr scalar_t zero = 0;
};

template <class scalar_t>
struct NaryMin {
    HOST_DEVICE static scalar_t forward(scalar_t result, scalar_t x) {
        return result < x ? result : x;
    }

    HOST_DEVICE static scalar_t backward(scalar_t result, scalar_t x) {
        return result == x ? 1 : 0;
    }

    static constexpr scalar_t zero = std::numeric_limits<scalar_t>::max();
};

template <class scalar_t>
struct NaryMax {
    HOST_DEVICE static scalar_t forward(scalar_t result, scalar_t x) {
        return result > x ? result : x;
    }

    HOST_DEVICE static scalar_t backward(scalar_t result, scalar_t x) {
        return result == x ? 1 : 0;
    }

    static constexpr scalar_t zero = std::numeric_limits<scalar_t>::lowest();
};

} // namespace at
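The min/max aggregators above implement a subgradient convention: `backward(result, x)` returns 1 only when `x` equals the aggregated `result`, so gradient flows exclusively to the arguments that attained the extremum (to all of them, in case of ties). A plain-Python sketch of the same convention, with hypothetical helper names:

def nary_min_forward(result, x):
    # the identity element is +inf, mirroring NaryMin::zero
    return min(result, x)

def nary_min_backward(result, x):
    # subgradient of the running min: 1 where x attained the minimum, 0 elsewhere
    return 1.0 if result == x else 0.0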
ultra/rspmm/source/rspmm.cpp
ADDED
@@ -0,0 +1,283 @@
#include <mutex>

#include <ATen/Parallel.h>

#include "operator.cuh"
#include "rspmm.h"

namespace at {

// In PyTorch 1.4.0, parallel_for depends on some functions from at::internal in ATen/Parallel.h
// which are not explicitly included
// This was fixed in a later PyTorch release
using namespace at::internal;

void rspmm_forward_check(CheckedFrom c, const TensorArg &edge_index_arg, const TensorArg &edge_type_arg,
        const TensorArg &edge_weight_arg, const TensorArg &relation_arg, const TensorArg &input_arg) {
    checkDim(c, edge_index_arg, 2);
    checkDim(c, edge_type_arg, 1);
    checkDim(c, edge_weight_arg, 1);
    checkDim(c, relation_arg, 2);
    checkDim(c, input_arg, 2);
    checkSameType(c, edge_index_arg, edge_type_arg);
    checkAllSameType(c, {edge_weight_arg, relation_arg, input_arg});
    checkSize(c, edge_index_arg, 0, 2);
    checkSize(c, edge_type_arg, {edge_index_arg->size(1)});
    checkSize(c, edge_weight_arg, {edge_index_arg->size(1)});
    checkSize(c, relation_arg, 1, input_arg->size(1));
}

void rspmm_backward_check(CheckedFrom c, const TensorArg &edge_index_arg, const TensorArg &edge_type_arg,
        const TensorArg &edge_weight_arg, const TensorArg &relation_arg, const TensorArg &input_arg,
        const TensorArg &output_arg, const TensorArg &output_grad_arg) {
    rspmm_forward_check(c, edge_index_arg, edge_type_arg, edge_weight_arg, relation_arg, input_arg);
    checkDim(c, output_arg, 2);
    checkSameSize(c, output_arg, output_grad_arg);
    checkAllSameType(c, {input_arg, output_arg, output_grad_arg});
    checkSize(c, output_arg, 1, input_arg->size(1));
}

Tensor ind2ptr(const Tensor &index, int size) {
    // scatter_add is super slow for int64, due to non-hardware atomic operations
    // use int32 instead
    Tensor num_per_index = at::zeros({size}, index.options().dtype(at::ScalarType::Int));
    num_per_index.scatter_add_(0, index, at::ones(index.sizes(), num_per_index.options()));
    num_per_index = num_per_index.toType(at::ScalarType::Long);
    Tensor pointer = num_per_index.cumsum(0) - num_per_index;
    return pointer;
}

template <class scalar_t, class NaryOp, class BinaryOp>
void rspmm_forward_out_cpu(const int64_t *row_ptr, const int64_t *col_ind, const int64_t *layer_ind,
        const scalar_t *weight, const scalar_t *relation, const scalar_t *input,
        scalar_t *output,
        int64_t num_row, int64_t nnz, int64_t dim) {
    parallel_for(0, num_row, 0, [&](int64_t row_start, int64_t row_end) {
        for (int64_t row = row_start; row < row_end; row++) {
            for (int64_t d = 0; d < dim; d++)
                output[row * dim + d] = NaryOp::zero;

            int64_t ptr_start = row_ptr[row];
            int64_t ptr_end = row + 1 < num_row ? row_ptr[row + 1] : nnz;
            for (int64_t ptr = ptr_start; ptr < ptr_end; ptr++) {
                int64_t col = col_ind[ptr];
                int64_t layer = layer_ind[ptr];
                scalar_t w = weight[ptr];
                for (int64_t d = 0; d < dim; d++) {
                    scalar_t x = BinaryOp::forward(relation[layer * dim + d], input[col * dim + d]);
                    scalar_t y = w * x;
                    scalar_t &out = output[row * dim + d];
                    out = NaryOp::forward(out, y);
                }
            }
        }
    });
}

template <class scalar_t, class NaryOp, class BinaryOp>
void rspmm_backward_out_cpu(const int64_t *row_ptr, const int64_t *col_ind, const int64_t *layer_ind,
        const scalar_t *weight, const scalar_t *relation, const scalar_t *input,
        const scalar_t *output, const scalar_t *output_grad,
        scalar_t *weight_grad, scalar_t *relation_grad, scalar_t *input_grad,
        int64_t num_row, int64_t nnz, int64_t dim,
        std::vector<std::mutex> &relation_mutex, std::vector<std::mutex> &input_mutex) {
    parallel_for(0, num_row, 0, [&](int64_t row_start, int64_t row_end) {
        for (int64_t row = row_start; row < row_end; row++) {
            int64_t ptr_start = row_ptr[row];
            int64_t ptr_end = row + 1 < num_row ? row_ptr[row + 1] : nnz;
            for (int64_t ptr = ptr_start; ptr < ptr_end; ptr++) {
                int64_t col = col_ind[ptr];
                int64_t layer = layer_ind[ptr];
                scalar_t w = weight[ptr];
                scalar_t w_grad = 0;
                for (int64_t d = 0; d < dim; d++) {
                    scalar_t rel = relation[layer * dim + d];
                    scalar_t in = input[col * dim + d];
                    scalar_t out = output[row * dim + d];
                    scalar_t out_grad = output_grad[row * dim + d];
                    scalar_t x = BinaryOp::forward(rel, in);
                    scalar_t y = w * x;
                    scalar_t dx_drel = BinaryOp::backward_lhs(rel, in);
                    scalar_t dx_din = BinaryOp::backward_rhs(rel, in);
                    scalar_t dout_dy = NaryOp::backward(out, y);
                    scalar_t dy_dw = x;
                    scalar_t dy_dx = w;
                    w_grad += out_grad * dout_dy * dy_dw;
                    {
                        std::lock_guard<std::mutex> lock(relation_mutex[layer * dim + d]);
                        relation_grad[layer * dim + d] += out_grad * dout_dy * dy_dx * dx_drel;
                    }
                    {
                        std::lock_guard<std::mutex> lock(input_mutex[col * dim + d]);
                        input_grad[col * dim + d] += out_grad * dout_dy * dy_dx * dx_din;
                    }
                }
                weight_grad[ptr] = w_grad;
            }
        }
    });
}

template <template<class> class NaryOp, template<class> class BinaryOp>
Tensor rspmm_forward_cpu(const Tensor &edge_index_, const Tensor &edge_type_, const Tensor &edge_weight_,
        const Tensor &relation_, const Tensor &input_) {
    constexpr const char *fn_name = "rspmm_forward_cpu";
    TensorArg edge_index_arg(edge_index_, "edge_index", 1), edge_type_arg(edge_type_, "edge_type", 2),
              edge_weight_arg(edge_weight_, "edge_weight", 3), relation_arg(relation_, "relation", 4),
              input_arg(input_, "input", 5);

    rspmm_forward_check(fn_name, edge_index_arg, edge_type_arg, edge_weight_arg, relation_arg, input_arg);
    checkDeviceType(fn_name, {edge_index_, edge_type_, edge_weight_, relation_, input_}, kCPU);

    const Tensor edge_index = edge_index_.contiguous();
    const Tensor edge_type = edge_type_.contiguous();
    const Tensor edge_weight = edge_weight_.contiguous();
    const Tensor relation = relation_.contiguous();
    const Tensor input = input_.contiguous();

    int64_t nnz = edge_index.size(0);
    int64_t num_row = input.size(0);
    int64_t dim = input.size(1);
    Tensor output = at::empty({num_row, dim}, input.options());

    Tensor row_ind = edge_index.select(0, 0);
    Tensor row_ptr = ind2ptr(row_ind, num_row);
    Tensor col_ind = edge_index.select(0, 1);
    Tensor layer_ind = edge_type;

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rspmm_forward_cpu", [&] {
        rspmm_forward_out_cpu<scalar_t, NaryOp<scalar_t>, BinaryOp<scalar_t>>(
            row_ptr.data_ptr<int64_t>(),
            col_ind.data_ptr<int64_t>(),
            layer_ind.data_ptr<int64_t>(),
            edge_weight.data_ptr<scalar_t>(),
            relation.data_ptr<scalar_t>(),
            input.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>(),
            num_row, nnz, dim
        );
    });

    return output;
}

template <template<class> class NaryOp, template<class> class BinaryOp>
std::tuple<Tensor, Tensor, Tensor> rspmm_backward_cpu(
        const Tensor &edge_index_, const Tensor &edge_type_, const Tensor &edge_weight_,
        const Tensor &relation_, const Tensor &input_, const Tensor &output_, const Tensor &output_grad_) {
    constexpr const char *fn_name = "rspmm_backward_cpu";
    TensorArg edge_index_arg(edge_index_, "edge_index", 1), edge_type_arg(edge_type_, "edge_type", 2),
              edge_weight_arg(edge_weight_, "edge_weight", 3), relation_arg(relation_, "relation", 4),
              input_arg(input_, "input", 5), output_arg(output_, "output", 6),
              output_grad_arg(output_grad_, "output_grad", 7);

    rspmm_backward_check(fn_name, edge_index_arg, edge_type_arg, edge_weight_arg, relation_arg, input_arg,
                         output_arg, output_grad_arg);
    checkDeviceType(fn_name, {edge_index_, edge_type_, edge_weight_, relation_, input_, output_, output_grad_}, kCPU);

    const Tensor edge_index = edge_index_.contiguous();
    const Tensor edge_type = edge_type_.contiguous();
    const Tensor edge_weight = edge_weight_.contiguous();
    const Tensor relation = relation_.contiguous();
    const Tensor input = input_.contiguous();
    const Tensor output = output_.contiguous();
    const Tensor output_grad = output_grad_.contiguous();

    int64_t nnz = edge_index.size(0);
    int64_t num_row = input.size(0);
    int64_t dim = input.size(1);
    Tensor weight_grad = at::zeros_like(edge_weight);
    Tensor relation_grad = at::zeros_like(relation);
    Tensor input_grad = at::zeros_like(input);

    Tensor row_ind = edge_index.select(0, 0);
    Tensor row_ptr = ind2ptr(row_ind, num_row);
    Tensor col_ind = edge_index.select(0, 1);
    Tensor layer_ind = edge_type;
    std::vector<std::mutex> relation_mutex(relation.numel());
    std::vector<std::mutex> input_mutex(input.numel());

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rspmm_backward_cpu", [&] {
        rspmm_backward_out_cpu<scalar_t, NaryOp<scalar_t>, BinaryOp<scalar_t>>(
            row_ptr.data_ptr<int64_t>(),
            col_ind.data_ptr<int64_t>(),
            layer_ind.data_ptr<int64_t>(),
            edge_weight.data_ptr<scalar_t>(),
            relation.data_ptr<scalar_t>(),
            input.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>(),
            output_grad.data_ptr<scalar_t>(),
            weight_grad.data_ptr<scalar_t>(),
            relation_grad.data_ptr<scalar_t>(),
            input_grad.data_ptr<scalar_t>(),
            num_row, nnz, dim,
            relation_mutex, input_mutex
        );
    });

    return std::make_tuple(weight_grad, relation_grad, input_grad);
}

#define DECLARE_FORWARD_IMPL(ADD, MUL, NARYOP, BINARYOP) \
    Tensor rspmm_##ADD##_##MUL##_forward_cpu( \
            const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, \
            const Tensor &relation, const Tensor &input) { \
        return rspmm_forward_cpu<NARYOP, BINARYOP>(edge_index, edge_type, edge_weight, relation, input); \
    }

#define DECLARE_BACKWARD_IMPL(ADD, MUL, NARYOP, BINARYOP) \
    std::tuple<Tensor, Tensor, Tensor> rspmm_##ADD##_##MUL##_backward_cpu( \
            const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, \
            const Tensor &relation, const Tensor &input, const Tensor &output, const Tensor &output_grad) { \
        return rspmm_backward_cpu<NARYOP, BINARYOP>(edge_index, edge_type, edge_weight, relation, input, \
                                                    output, output_grad); \
    }

DECLARE_FORWARD_IMPL(add, mul, NaryAdd, BinaryMul)
DECLARE_BACKWARD_IMPL(add, mul, NaryAdd, BinaryMul)

DECLARE_FORWARD_IMPL(min, mul, NaryMin, BinaryMul)
DECLARE_BACKWARD_IMPL(min, mul, NaryMin, BinaryMul)

DECLARE_FORWARD_IMPL(max, mul, NaryMax, BinaryMul)
DECLARE_BACKWARD_IMPL(max, mul, NaryMax, BinaryMul)

DECLARE_FORWARD_IMPL(add, add, NaryAdd, BinaryAdd)
DECLARE_BACKWARD_IMPL(add, add, NaryAdd, BinaryAdd)

DECLARE_FORWARD_IMPL(min, add, NaryMin, BinaryAdd)
DECLARE_BACKWARD_IMPL(min, add, NaryMin, BinaryAdd)

DECLARE_FORWARD_IMPL(max, add, NaryMax, BinaryAdd)
DECLARE_BACKWARD_IMPL(max, add, NaryMax, BinaryAdd)

} // namespace at

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("rspmm_add_mul_forward_cpu", &at::rspmm_add_mul_forward_cpu);
    m.def("rspmm_add_mul_backward_cpu", &at::rspmm_add_mul_backward_cpu);
    m.def("rspmm_min_mul_forward_cpu", &at::rspmm_min_mul_forward_cpu);
    m.def("rspmm_min_mul_backward_cpu", &at::rspmm_min_mul_backward_cpu);
    m.def("rspmm_max_mul_forward_cpu", &at::rspmm_max_mul_forward_cpu);
    m.def("rspmm_max_mul_backward_cpu", &at::rspmm_max_mul_backward_cpu);
    m.def("rspmm_add_add_forward_cpu", &at::rspmm_add_add_forward_cpu);
    m.def("rspmm_add_add_backward_cpu", &at::rspmm_add_add_backward_cpu);
    m.def("rspmm_min_add_forward_cpu", &at::rspmm_min_add_forward_cpu);
    m.def("rspmm_min_add_backward_cpu", &at::rspmm_min_add_backward_cpu);
    m.def("rspmm_max_add_forward_cpu", &at::rspmm_max_add_forward_cpu);
    m.def("rspmm_max_add_backward_cpu", &at::rspmm_max_add_backward_cpu);
#ifdef CUDA_OP
    m.def("rspmm_add_mul_forward_cuda", &at::rspmm_add_mul_forward_cuda);
    m.def("rspmm_add_mul_backward_cuda", &at::rspmm_add_mul_backward_cuda);
    m.def("rspmm_min_mul_forward_cuda", &at::rspmm_min_mul_forward_cuda);
    m.def("rspmm_min_mul_backward_cuda", &at::rspmm_min_mul_backward_cuda);
    m.def("rspmm_max_mul_forward_cuda", &at::rspmm_max_mul_forward_cuda);
    m.def("rspmm_max_mul_backward_cuda", &at::rspmm_max_mul_backward_cuda);
    m.def("rspmm_add_add_forward_cuda", &at::rspmm_add_add_forward_cuda);
    m.def("rspmm_add_add_backward_cuda", &at::rspmm_add_add_backward_cuda);
    m.def("rspmm_min_add_forward_cuda", &at::rspmm_min_add_forward_cuda);
    m.def("rspmm_min_add_backward_cuda", &at::rspmm_min_add_backward_cuda);
    m.def("rspmm_max_add_forward_cuda", &at::rspmm_max_add_forward_cuda);
    m.def("rspmm_max_add_backward_cuda", &at::rspmm_max_add_backward_cuda);
#endif
}
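`ind2ptr` converts a sorted row-index vector into CSR-style row offsets; the kernels then read the end of the last row's range as `nnz`. A small Python sketch of the same computation (`ind2ptr_reference` is a hypothetical name; `index` is assumed to be sorted, as the callers guarantee):

import torch

def ind2ptr_reference(index, size):
    # count entries per row, then take an exclusive prefix sum
    num_per_index = torch.zeros(size, dtype=torch.long)
    num_per_index.scatter_add_(0, index, torch.ones_like(index))
    return num_per_index.cumsum(0) - num_per_index

row_ind = torch.tensor([0, 0, 1, 3])
print(ind2ptr_reference(row_ind, 4))  # tensor([0, 2, 3, 3])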
ultra/rspmm/source/rspmm.cu
ADDED
@@ -0,0 +1,386 @@
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>

#include "util.cuh"
#include "operator.cuh"
#include "rspmm.h"

namespace at {

// Memory & time efficient implementation of generalized spmm
// Much of the code is inspired by GE-SpMM
// https://github.com/hgyhungry/ge-spmm

namespace {

const int kCoarseningFactor = 2;
const int kThreadPerBlock = 256;

} // namespace anonymous

template <class scalar_t, class NaryOp, class BinaryOp>
__global__
void rspmm_forward_out_cuda(const int64_t *row_ptr, const int64_t *col_ind, const int64_t *layer_ind,
        const scalar_t *weight, const scalar_t *relation, const scalar_t *input,
        scalar_t *output,
        int64_t num_row, int64_t nnz, int64_t dim) {
    // for best optimization, the following code is compiled with constant warpSize
    assert(blockDim.x == warpSize);

    extern __shared__ int64_t buffer[];
    int64_t *col_ind_buf = buffer;
    int64_t *layer_ind_buf = buffer + blockDim.y * warpSize;
    scalar_t *weight_buf = reinterpret_cast<scalar_t *>(layer_ind_buf + blockDim.y * warpSize);
    col_ind_buf += threadIdx.y * warpSize;
    layer_ind_buf += threadIdx.y * warpSize;
    weight_buf += threadIdx.y * warpSize;

    int64_t row = blockIdx.x * blockDim.y + threadIdx.y;
    if (row >= num_row)
        return;
    int64_t d_start = blockIdx.y * warpSize * kCoarseningFactor + threadIdx.x;
    int64_t ptr_start = row_ptr[row];
    int64_t ptr_end = row + 1 < num_row ? row_ptr[row + 1] : nnz;
    scalar_t out[kCoarseningFactor];
#pragma unroll
    for (int64_t i = 0; i < kCoarseningFactor; i++)
        out[i] = NaryOp::zero;

    for (int64_t block_ptr = ptr_start; block_ptr < ptr_end; block_ptr += warpSize) {
        int64_t ptr = block_ptr + threadIdx.x;
        if (ptr < ptr_end) {
            col_ind_buf[threadIdx.x] = col_ind[ptr];
            layer_ind_buf[threadIdx.x] = layer_ind[ptr];
            weight_buf[threadIdx.x] = weight[ptr];
        }
        __syncwarp();

        int64_t max_offset = warpSize < ptr_end - block_ptr ? warpSize : ptr_end - block_ptr;
        for (int64_t offset_ptr = 0; offset_ptr < max_offset; offset_ptr++) {
            int64_t col = col_ind_buf[offset_ptr];
            int64_t layer = layer_ind_buf[offset_ptr];
            scalar_t w = weight_buf[offset_ptr];
#pragma unroll
            for (int64_t i = 0; i < kCoarseningFactor; i++) {
                int64_t d = d_start + i * warpSize;
                if (d >= dim)
                    break;
                scalar_t x = BinaryOp::forward(relation[layer * dim + d], input[col * dim + d]);
                scalar_t y = w * x;
                out[i] = NaryOp::forward(out[i], y);
            }
        }
        __syncwarp();
    }

#pragma unroll
    for (int64_t i = 0; i < kCoarseningFactor; i++) {
        int64_t d = d_start + i * warpSize;
        if (d >= dim)
            break;
        output[row * dim + d] = out[i];
    }
}

template <class scalar_t, class NaryOp, class BinaryOp>
__global__
void rspmm_backward_out_cuda(const int64_t *row_ptr, const int64_t *col_ind, const int64_t *layer_ind,
        const scalar_t *weight, const scalar_t *relation, const scalar_t *input,
        const scalar_t *output, const scalar_t *output_grad,
        scalar_t *weight_grad, scalar_t *relation_grad, scalar_t *input_grad,
        int64_t num_row, int64_t nnz, int64_t dim) {
    // for best optimization, the following code is compiled with constant warpSize
    assert(blockDim.x == warpSize);

    extern __shared__ int64_t buffer[];
    int64_t *col_ind_buf = buffer;
    int64_t *layer_ind_buf = col_ind_buf + blockDim.y * warpSize;
    scalar_t *weight_buf = reinterpret_cast<scalar_t *>(layer_ind_buf + blockDim.y * warpSize);
    col_ind_buf += threadIdx.y * warpSize;
    layer_ind_buf += threadIdx.y * warpSize;
    weight_buf += threadIdx.y * warpSize;

    int64_t row = blockIdx.x * blockDim.y + threadIdx.y;
    if (row >= num_row)
        return;
    int64_t d_start = blockIdx.y * warpSize * kCoarseningFactor + threadIdx.x;
    int64_t ptr_start = row_ptr[row];
    int64_t ptr_end = row + 1 < num_row ? row_ptr[row + 1] : nnz;

    for (int64_t block_ptr = ptr_start; block_ptr < ptr_end; block_ptr += warpSize) {
        int64_t ptr = block_ptr + threadIdx.x;
        if (ptr < ptr_end) {
            col_ind_buf[threadIdx.x] = col_ind[ptr];
            layer_ind_buf[threadIdx.x] = layer_ind[ptr];
            weight_buf[threadIdx.x] = weight[ptr];
        }
        __syncwarp();

        int64_t max_offset = warpSize < ptr_end - block_ptr ? warpSize : ptr_end - block_ptr;
        for (int64_t offset_ptr = 0; offset_ptr < max_offset; offset_ptr++) {
            int64_t col = col_ind_buf[offset_ptr];
            int64_t layer = layer_ind_buf[offset_ptr];
            scalar_t w = weight_buf[offset_ptr];
            scalar_t w_grad = 0;
#pragma unroll
            for (int64_t i = 0; i < kCoarseningFactor; i++) {
                int64_t d = d_start + i * warpSize;
                if (d >= dim)
                    break;
                scalar_t rel = relation[layer * dim + d];
                scalar_t in = input[col * dim + d];
                scalar_t out = output[row * dim + d];
                scalar_t out_grad = output_grad[row * dim + d];
                scalar_t x = BinaryOp::forward(rel, in);
                scalar_t y = w * x;
                scalar_t dx_drel = BinaryOp::backward_lhs(rel, in);
                scalar_t dx_din = BinaryOp::backward_rhs(rel, in);
                scalar_t dout_dy = NaryOp::backward(out, y);
                scalar_t dy_dw = x;
                scalar_t dy_dx = w;
                w_grad += out_grad * dout_dy * dy_dw;
                atomicAdd(&relation_grad[layer * dim + d], out_grad * dout_dy * dy_dx * dx_drel);
                atomicAdd(&input_grad[col * dim + d], out_grad * dout_dy * dy_dx * dx_din);
            }
            w_grad = warp_reduce(w_grad);
            if (threadIdx.x == 0)
                atomicAdd(&weight_grad[block_ptr + offset_ptr], w_grad);
        }
        __syncwarp();
    }
}

// only relation & input require gradients
template <class scalar_t, class NaryOp, class BinaryOp>
__global__
void rspmm_backward_out_cuda(const int64_t *row_ptr, const int64_t *col_ind, const int64_t *layer_ind,
        const scalar_t *weight, const scalar_t *relation, const scalar_t *input,
        const scalar_t *output, const scalar_t *output_grad,
        scalar_t *relation_grad, scalar_t *input_grad,
        int64_t num_row, int64_t nnz, int64_t dim) {
    // for best optimization, the following code is compiled with constant warpSize
    assert(blockDim.x == warpSize);

    extern __shared__ int64_t buffer[];
    int64_t *col_ind_buf = buffer;
    int64_t *layer_ind_buf = col_ind_buf + blockDim.y * warpSize;
    scalar_t *weight_buf = reinterpret_cast<scalar_t *>(layer_ind_buf + blockDim.y * warpSize);
    col_ind_buf += threadIdx.y * warpSize;
    layer_ind_buf += threadIdx.y * warpSize;
    weight_buf += threadIdx.y * warpSize;

    int64_t row = blockIdx.x * blockDim.y + threadIdx.y;
    if (row >= num_row)
        return;
    int64_t d_start = blockIdx.y * warpSize * kCoarseningFactor + threadIdx.x;
    int64_t ptr_start = row_ptr[row];
    int64_t ptr_end = row + 1 < num_row ? row_ptr[row + 1] : nnz;

    for (int64_t block_ptr = ptr_start; block_ptr < ptr_end; block_ptr += warpSize) {
        int64_t ptr = block_ptr + threadIdx.x;
        if (ptr < ptr_end) {
            col_ind_buf[threadIdx.x] = col_ind[ptr];
            layer_ind_buf[threadIdx.x] = layer_ind[ptr];
            weight_buf[threadIdx.x] = weight[ptr];
        }
        __syncwarp();

        int64_t max_offset = warpSize < ptr_end - block_ptr ? warpSize : ptr_end - block_ptr;
        for (int64_t offset_ptr = 0; offset_ptr < max_offset; offset_ptr++) {
            int64_t col = col_ind_buf[offset_ptr];
            int64_t layer = layer_ind_buf[offset_ptr];
            scalar_t w = weight_buf[offset_ptr];
#pragma unroll
            for (int64_t i = 0; i < kCoarseningFactor; i++) {
                int64_t d = d_start + i * warpSize;
                if (d >= dim)
                    break;
                scalar_t rel = relation[layer * dim + d];
                scalar_t in = input[col * dim + d];
                scalar_t out = output[row * dim + d];
                scalar_t out_grad = output_grad[row * dim + d];
                scalar_t x = BinaryOp::forward(rel, in);
                scalar_t y = w * x;
                scalar_t dx_drel = BinaryOp::backward_lhs(rel, in);
                scalar_t dx_din = BinaryOp::backward_rhs(rel, in);
                scalar_t dout_dy = NaryOp::backward(out, y);
                scalar_t dy_dx = w;
                atomicAdd(&relation_grad[layer * dim + d], out_grad * dout_dy * dy_dx * dx_drel);
                atomicAdd(&input_grad[col * dim + d], out_grad * dout_dy * dy_dx * dx_din);
            }
        }
        __syncwarp();
    }
}

template <template<class> class NaryOp, template<class> class BinaryOp>
Tensor rspmm_forward_cuda(const Tensor &edge_index_, const Tensor &edge_type_, const Tensor &edge_weight_,
        const Tensor &relation_, const Tensor &input_) {
    constexpr const char *fn_name = "rspmm_forward_cuda";
    TensorArg edge_index_arg(edge_index_, "edge_index", 1), edge_type_arg(edge_type_, "edge_type", 2),
              edge_weight_arg(edge_weight_, "edge_weight", 3), relation_arg(relation_, "relation", 4),
              input_arg(input_, "input", 5);

    rspmm_forward_check(fn_name, edge_index_arg, edge_type_arg, edge_weight_arg, relation_arg, input_arg);
    checkAllSameGPU(fn_name, {edge_index_arg, edge_type_arg, edge_weight_arg, relation_arg, input_arg});

    const Tensor edge_index = edge_index_.contiguous();
    const Tensor edge_type = edge_type_.contiguous();
    const Tensor edge_weight = edge_weight_.contiguous();
    const Tensor relation = relation_.contiguous();
    const Tensor input = input_.contiguous();

    int64_t nnz = edge_index.size(0);
    int64_t num_row = input.size(0);
    int64_t dim = input.size(1);
    Tensor output = at::empty({num_row, dim}, input.options());

    Tensor row_ind = edge_index.select(0, 0);
    Tensor row_ptr = ind2ptr(row_ind, num_row);
    Tensor col_ind = edge_index.select(0, 1);
    Tensor layer_ind = edge_type;

    cudaSetDevice(input.get_device());
    auto stream = at::cuda::getCurrentCUDAStream();

    const int dim_per_block = 32; // warpSize
    const int num_dim_block = (dim + dim_per_block * kCoarseningFactor - 1) / (dim_per_block * kCoarseningFactor);
    const int row_per_block = kThreadPerBlock / dim_per_block;
    const int num_row_block = (num_row + row_per_block - 1) / row_per_block;

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rspmm_forward_cuda", [&] {
        const int memory_size = kThreadPerBlock * (sizeof(int64_t) * 2 + sizeof(scalar_t));
        rspmm_forward_out_cuda<scalar_t, NaryOp<scalar_t>, BinaryOp<scalar_t>>
                <<<dim3(num_row_block, num_dim_block), dim3(dim_per_block, row_per_block), memory_size, stream>>>(
            row_ptr.data_ptr<int64_t>(),
            col_ind.data_ptr<int64_t>(),
            layer_ind.data_ptr<int64_t>(),
            edge_weight.data_ptr<scalar_t>(),
            relation.data_ptr<scalar_t>(),
            input.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>(),
            num_row, nnz, dim
        );
    });

    return output;
}

template <template<class> class NaryOp, template<class> class BinaryOp>
std::tuple<Tensor, Tensor, Tensor> rspmm_backward_cuda(
        const Tensor &edge_index_, const Tensor &edge_type_, const Tensor &edge_weight_,
        const Tensor &relation_, const Tensor &input_, const Tensor &output_, const Tensor &output_grad_) {
    constexpr const char *fn_name = "rspmm_backward_cuda";
    TensorArg edge_index_arg(edge_index_, "edge_index", 1), edge_type_arg(edge_type_, "edge_type", 2),
              edge_weight_arg(edge_weight_, "edge_weight", 3), relation_arg(relation_, "relation", 4),
              input_arg(input_, "input", 5), output_arg(output_, "output", 6),
              output_grad_arg(output_grad_, "output_grad", 7);

    rspmm_backward_check(fn_name, edge_index_arg, edge_type_arg, edge_weight_arg, relation_arg, input_arg,
                         output_arg, output_grad_arg);
    checkAllSameGPU(fn_name, {edge_index_arg, edge_type_arg, edge_weight_arg, relation_arg, input_arg, output_arg,
                              output_grad_arg});

    const Tensor edge_index = edge_index_.contiguous();
    const Tensor edge_type = edge_type_.contiguous();
    const Tensor edge_weight = edge_weight_.contiguous();
    const Tensor relation = relation_.contiguous();
    const Tensor input = input_.contiguous();
    const Tensor output = output_.contiguous();
    const Tensor output_grad = output_grad_.contiguous();

    int64_t nnz = edge_index.size(0);
    int64_t num_row = input.size(0);
    int64_t dim = input.size(1);
    Tensor weight_grad = at::zeros_like(edge_weight);
    Tensor relation_grad = at::zeros_like(relation);
    Tensor input_grad = at::zeros_like(input);

    Tensor row_ind = edge_index.select(0, 0);
    Tensor row_ptr = ind2ptr(row_ind, num_row);
    Tensor col_ind = edge_index.select(0, 1);
    Tensor layer_ind = edge_type;

    cudaSetDevice(input.get_device());
    auto stream = at::cuda::getCurrentCUDAStream();

    const int dim_per_block = 32; // warpSize
    const int num_dim_block = (dim + dim_per_block * kCoarseningFactor - 1) / (dim_per_block * kCoarseningFactor);
    const int row_per_block = kThreadPerBlock / dim_per_block;
    const int num_row_block = (num_row + row_per_block - 1) / row_per_block;

    if (edge_weight.requires_grad())
        AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rspmm_backward_cuda", [&] {
            const int memory_size = kThreadPerBlock * (sizeof(int64_t) * 2 + sizeof(scalar_t));
            rspmm_backward_out_cuda<scalar_t, NaryOp<scalar_t>, BinaryOp<scalar_t>>
                    <<<dim3(num_row_block, num_dim_block), dim3(dim_per_block, row_per_block), memory_size, stream>>>(
                row_ptr.data_ptr<int64_t>(),
                col_ind.data_ptr<int64_t>(),
                layer_ind.data_ptr<int64_t>(),
                edge_weight.data_ptr<scalar_t>(),
                relation.data_ptr<scalar_t>(),
                input.data_ptr<scalar_t>(),
                output.data_ptr<scalar_t>(),
                output_grad.data_ptr<scalar_t>(),
                weight_grad.data_ptr<scalar_t>(),
                relation_grad.data_ptr<scalar_t>(),
                input_grad.data_ptr<scalar_t>(),
                num_row, nnz, dim
            );
        });
    else
        AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rspmm_backward_cuda", [&] {
            const int memory_size = kThreadPerBlock * (sizeof(int64_t) * 2 + sizeof(scalar_t));
            rspmm_backward_out_cuda<scalar_t, NaryOp<scalar_t>, BinaryOp<scalar_t>>
                    <<<dim3(num_row_block, num_dim_block), dim3(dim_per_block, row_per_block), memory_size, stream>>>(
                row_ptr.data_ptr<int64_t>(),
                col_ind.data_ptr<int64_t>(),
                layer_ind.data_ptr<int64_t>(),
                edge_weight.data_ptr<scalar_t>(),
                relation.data_ptr<scalar_t>(),
                input.data_ptr<scalar_t>(),
                output.data_ptr<scalar_t>(),
                output_grad.data_ptr<scalar_t>(),
                relation_grad.data_ptr<scalar_t>(),
                input_grad.data_ptr<scalar_t>(),
                num_row, nnz, dim
            );
        });

    return std::make_tuple(weight_grad, relation_grad, input_grad);
}

#define DECLARE_FORWARD_IMPL(ADD, MUL, NARYOP, BINARYOP) \
    Tensor rspmm_##ADD##_##MUL##_forward_cuda( \
            const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, \
            const Tensor &relation, const Tensor &input) { \
        return rspmm_forward_cuda<NARYOP, BINARYOP>(edge_index, edge_type, edge_weight, relation, input); \
    }

#define DECLARE_BACKWARD_IMPL(ADD, MUL, NARYOP, BINARYOP) \
    std::tuple<Tensor, Tensor, Tensor> rspmm_##ADD##_##MUL##_backward_cuda( \
            const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, \
            const Tensor &relation, const Tensor &input, const Tensor &output, const Tensor &output_grad) { \
        return rspmm_backward_cuda<NARYOP, BINARYOP>(edge_index, edge_type, edge_weight, relation, input, \
                                                     output, output_grad); \
    }

DECLARE_FORWARD_IMPL(add, mul, NaryAdd, BinaryMul)
DECLARE_BACKWARD_IMPL(add, mul, NaryAdd, BinaryMul)

DECLARE_FORWARD_IMPL(min, mul, NaryMin, BinaryMul)
DECLARE_BACKWARD_IMPL(min, mul, NaryMin, BinaryMul)

DECLARE_FORWARD_IMPL(max, mul, NaryMax, BinaryMul)
DECLARE_BACKWARD_IMPL(max, mul, NaryMax, BinaryMul)

DECLARE_FORWARD_IMPL(add, add, NaryAdd, BinaryAdd)
DECLARE_BACKWARD_IMPL(add, add, NaryAdd, BinaryAdd)

DECLARE_FORWARD_IMPL(min, add, NaryMin, BinaryAdd)
DECLARE_BACKWARD_IMPL(min, add, NaryMin, BinaryAdd)

DECLARE_FORWARD_IMPL(max, add, NaryMax, BinaryAdd)
DECLARE_BACKWARD_IMPL(max, add, NaryMax, BinaryAdd)

} // namespace at
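The launch configuration in `rspmm_forward_cuda` tiles rows over `blockIdx.x` and feature dimensions over `blockIdx.y`, with each thread covering `kCoarseningFactor` features. A small Python sketch of the same ceil-division arithmetic (`launch_dims` is a hypothetical helper; the constants are copied from the kernel):

def launch_dims(num_row, dim, thread_per_block=256, warp_size=32, coarsening=2):
    dim_per_block = warp_size                                  # threadIdx.x spans one warp
    num_dim_block = -(-dim // (dim_per_block * coarsening))    # ceil division
    row_per_block = thread_per_block // dim_per_block          # threadIdx.y
    num_row_block = -(-num_row // row_per_block)
    return (num_row_block, num_dim_block), (dim_per_block, row_per_block)

print(launch_dims(num_row=10000, dim=64))  # ((1250, 1), (32, 8))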
ultra/rspmm/source/rspmm.h
ADDED
@@ -0,0 +1,108 @@
#pragma once

#include <tuple>

#include <torch/extension.h>
//#include <ATen/SparseTensorUtils.h>
#include <ATen/native/SparseTensorUtils.h>

namespace at {

using namespace at::sparse;

void rspmm_forward_check(CheckedFrom c, const TensorArg &edge_index_arg, const TensorArg &edge_type_arg,
        const TensorArg &edge_weight_arg, const TensorArg &relation_arg, const TensorArg &input_arg);

void rspmm_backward_check(CheckedFrom c, const TensorArg &edge_index_arg, const TensorArg &edge_type_arg,
        const TensorArg &edge_weight_arg, const TensorArg &relation_arg, const TensorArg &input_arg,
        const TensorArg &output_arg, const TensorArg &output_grad_arg);

Tensor ind2ptr(const Tensor &index, int size);

Tensor rspmm_add_mul_forward_cpu(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_add_mul_backward_cpu(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_min_mul_forward_cpu(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_min_mul_backward_cpu(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_max_mul_forward_cpu(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_max_mul_backward_cpu(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_add_add_forward_cpu(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_add_add_backward_cpu(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_min_add_forward_cpu(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_min_add_backward_cpu(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_max_add_forward_cpu(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_max_add_backward_cpu(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

#ifdef CUDA_OP
Tensor rspmm_add_mul_forward_cuda(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_add_mul_backward_cuda(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_min_mul_forward_cuda(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_min_mul_backward_cuda(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_max_mul_forward_cuda(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_max_mul_backward_cuda(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_add_add_forward_cuda(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_add_add_backward_cuda(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_min_add_forward_cuda(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_min_add_backward_cuda(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);

Tensor rspmm_max_add_forward_cuda(const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight,
        const Tensor &relation, const Tensor &input);

std::tuple<Tensor, Tensor, Tensor> rspmm_max_add_backward_cuda(
        const Tensor &edge_index, const Tensor &edge_type, const Tensor &edge_weight, const Tensor &relation,
        const Tensor &input, const Tensor &output, const Tensor &output_grad);
#endif

} // namespace at
ultra/rspmm/source/util.cuh
ADDED
@@ -0,0 +1,28 @@
#pragma once

namespace at {

const unsigned kFullMask = 0xFFFFFFFF;

template <class scalar_t>
__device__ scalar_t warp_reduce(scalar_t value) {
#pragma unroll
    for (int delta = 1; delta < warpSize; delta *= 2)
#if __CUDACC_VER_MAJOR__ >= 9
        value += __shfl_down_sync(kFullMask, value, delta);
#else
        value += __shfl_down(value, delta);
#endif
    return value;
}

template<class scalar_t>
__device__ scalar_t warp_broadcast(scalar_t value, int lane_id) {
#if __CUDACC_VER_MAJOR__ >= 9
    return __shfl_sync(kFullMask, value, lane_id);
#else
    return __shfl(value, lane_id);
#endif
}

} // namespace at
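`warp_reduce` is a shuffle-down tree reduction: after log2(warpSize) rounds of `__shfl_down_sync`, lane 0 holds the warp-wide sum. A plain-Python simulation of the same exchange pattern (a sketch only; `warp_reduce_sim` is a hypothetical name and no CUDA is involved):

def warp_reduce_sim(values, warp_size=32):
    lanes = list(values)
    delta = 1
    while delta < warp_size:
        snapshot = list(lanes)                 # all lanes exchange simultaneously
        for i in range(warp_size):
            if i + delta < warp_size:          # shuffles past the warp leave the lane unchanged
                lanes[i] = snapshot[i] + snapshot[i + delta]
        delta *= 2
    return lanes[0]                            # lane 0 ends up with the full sum

assert warp_reduce_sim(list(range(32))) == sum(range(32))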
ultra/tasks.py
ADDED
@@ -0,0 +1,201 @@
from functools import reduce
from torch_scatter import scatter_add
from torch_geometric.data import Data
import torch


def edge_match(edge_index, query_index):
    # O((n + q)logn) time
    # O(n) memory
    # edge_index: big underlying graph
    # query_index: edges to match

    # preparing unique hashing of edges, base: (max_node, max_relation) + 1
    base = edge_index.max(dim=1)[0] + 1
    # we will map edges to long ints, so we need to make sure the maximum product is less than MAX_LONG_INT
    # idea: max number of edges = num_nodes * num_relations
    # e.g. for a graph of 10 nodes / 5 relations, edge IDs 0...9 mean all possible outgoing edge types from node 0
    # given a tuple (h, r), we will search for all other existing edges starting from head h
    assert reduce(int.__mul__, base.tolist()) < torch.iinfo(torch.long).max
    scale = base.cumprod(0)
    scale = scale[-1] // scale

    # hash both the original edge index and the query index to unique integers
    edge_hash = (edge_index * scale.unsqueeze(-1)).sum(dim=0)
    edge_hash, order = edge_hash.sort()
    query_hash = (query_index * scale.unsqueeze(-1)).sum(dim=0)

    # matched ranges: [start[i], end[i])
    start = torch.bucketize(query_hash, edge_hash)
    end = torch.bucketize(query_hash, edge_hash, right=True)
    # num_match shows how many edges satisfy the (h, r) pattern for each query in the batch
    num_match = end - start

    # generate the corresponding ranges
    offset = num_match.cumsum(0) - num_match
    range = torch.arange(num_match.sum(), device=edge_index.device)
    range = range + (start - offset).repeat_interleave(num_match)

    return order[range], num_match

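To make the hashing in edge_match concrete: with base = (max_node + 1, max_relation + 1), each (h, r) pair maps to the unique integer h * (max_relation + 1) + r, so all matches for a query form a contiguous range in the sorted hash array, found via torch.bucketize. A worked toy example (a sketch, not part of the module):

import torch

edge_index = torch.tensor([[0, 0, 1, 2],    # heads
                           [0, 1, 0, 1]])   # relations
query_index = torch.tensor([[0], [1]])      # find all edges matching (h=0, r=1)

base = edge_index.max(dim=1)[0] + 1         # tensor([3, 2])
scale = base.cumprod(0)
scale = scale[-1] // scale                  # tensor([2, 1]): hash = h * 2 + r

edge_hash = (edge_index * scale.unsqueeze(-1)).sum(dim=0)    # tensor([0, 1, 2, 5]), already sorted
edge_hash, order = edge_hash.sort()
query_hash = (query_index * scale.unsqueeze(-1)).sum(dim=0)  # tensor([1])

start = torch.bucketize(query_hash, edge_hash)               # tensor([1])
end = torch.bucketize(query_hash, edge_hash, right=True)     # tensor([2])
# the half-open range [1, 2) marks exactly one match: edge 1, i.e. (h=0, r=1)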

def negative_sampling(data, batch, num_negative, strict=True):
    batch_size = len(batch)
    pos_h_index, pos_t_index, pos_r_index = batch.t()

    # strict negative sampling vs random negative sampling
    if strict:
        t_mask, h_mask = strict_negative_mask(data, batch)
        t_mask = t_mask[:batch_size // 2]
        neg_t_candidate = t_mask.nonzero()[:, 1]
        num_t_candidate = t_mask.sum(dim=-1)
        # draw samples for negative tails
        rand = torch.rand(len(t_mask), num_negative, device=batch.device)
        index = (rand * num_t_candidate.unsqueeze(-1)).long()
        index = index + (num_t_candidate.cumsum(0) - num_t_candidate).unsqueeze(-1)
        neg_t_index = neg_t_candidate[index]

        h_mask = h_mask[batch_size // 2:]
        neg_h_candidate = h_mask.nonzero()[:, 1]
        num_h_candidate = h_mask.sum(dim=-1)
        # draw samples for negative heads
        rand = torch.rand(len(h_mask), num_negative, device=batch.device)
        index = (rand * num_h_candidate.unsqueeze(-1)).long()
        index = index + (num_h_candidate.cumsum(0) - num_h_candidate).unsqueeze(-1)
        neg_h_index = neg_h_candidate[index]
    else:
        neg_index = torch.randint(data.num_nodes, (batch_size, num_negative), device=batch.device)
        neg_t_index, neg_h_index = neg_index[:batch_size // 2], neg_index[batch_size // 2:]

    h_index = pos_h_index.unsqueeze(-1).repeat(1, num_negative + 1)
    t_index = pos_t_index.unsqueeze(-1).repeat(1, num_negative + 1)
    r_index = pos_r_index.unsqueeze(-1).repeat(1, num_negative + 1)
    t_index[:batch_size // 2, 1:] = neg_t_index
    h_index[batch_size // 2:, 1:] = neg_h_index

    return torch.stack([h_index, t_index, r_index], dim=-1)


def all_negative(data, batch):
    pos_h_index, pos_t_index, pos_r_index = batch.t()
    r_index = pos_r_index.unsqueeze(-1).expand(-1, data.num_nodes)
    # generate all negative tails for this batch
    all_index = torch.arange(data.num_nodes, device=batch.device)
    h_index, t_index = torch.meshgrid(pos_h_index, all_index, indexing="ij")  # indexing "xy" would return transposed
    t_batch = torch.stack([h_index, t_index, r_index], dim=-1)
    # generate all negative heads for this batch
    all_index = torch.arange(data.num_nodes, device=batch.device)
    t_index, h_index = torch.meshgrid(pos_t_index, all_index, indexing="ij")
    h_batch = torch.stack([h_index, t_index, r_index], dim=-1)

    return t_batch, h_batch


def strict_negative_mask(data, batch):
    # this function makes sure that for a given (h, r) batch we will NOT sample true tails as random negatives
    # similarly, for a given (t, r) we will NOT sample existing true heads as random negatives

    pos_h_index, pos_t_index, pos_r_index = batch.t()

    # part I: sample hard negative tails
    # edge index of all (head, relation) edges from the underlying graph
    edge_index = torch.stack([data.edge_index[0], data.edge_type])
    # edge index of current batch (head, relation) for which we will sample negatives
    query_index = torch.stack([pos_h_index, pos_r_index])
    # search for all true tails for the given (h, r) batch
    edge_id, num_t_truth = edge_match(edge_index, query_index)
    # build an index from the found edges
    t_truth_index = data.edge_index[1, edge_id]
    sample_id = torch.arange(len(num_t_truth), device=batch.device).repeat_interleave(num_t_truth)
    t_mask = torch.ones(len(num_t_truth), data.num_nodes, dtype=torch.bool, device=batch.device)
    # assign 0s to the mask with the found true tails
    t_mask[sample_id, t_truth_index] = 0
    t_mask.scatter_(1, pos_t_index.unsqueeze(-1), 0)

    # part II: sample hard negative heads
    # edge_index[1] denotes tails, so the edge index becomes (t, r)
    edge_index = torch.stack([data.edge_index[1], data.edge_type])
    # edge index of current batch (tail, relation) for which we will sample heads
    query_index = torch.stack([pos_t_index, pos_r_index])
    # search for all true heads for the given (t, r) batch
    edge_id, num_h_truth = edge_match(edge_index, query_index)
    # build an index from the found edges
    h_truth_index = data.edge_index[0, edge_id]
    sample_id = torch.arange(len(num_h_truth), device=batch.device).repeat_interleave(num_h_truth)
    h_mask = torch.ones(len(num_h_truth), data.num_nodes, dtype=torch.bool, device=batch.device)
    # assign 0s to the mask with the found true heads
    h_mask[sample_id, h_truth_index] = 0
    h_mask.scatter_(1, pos_h_index.unsqueeze(-1), 0)

    return t_mask, h_mask

133 |
+
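
The resulting masks are boolean (bs, num_nodes) tensors where True means "safe to use as a negative" and False marks a known true answer for the query. The semantics in isolation (toy data, bypassing edge_match):

    import torch

    num_nodes = 5
    # suppose the first query (h, r) has known true tails {1, 3}
    t_mask = torch.ones(1, num_nodes, dtype=torch.bool)
    t_mask[0, torch.tensor([1, 3])] = False
    print(t_mask)  # tensor([[ True, False,  True, False,  True]])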

def compute_ranking(pred, target, mask=None):
    pos_pred = pred.gather(-1, target.unsqueeze(-1))
    if mask is not None:
        # filtered ranking
        ranking = torch.sum((pos_pred <= pred) & mask, dim=-1) + 1
    else:
        # unfiltered ranking
        ranking = torch.sum(pos_pred <= pred, dim=-1) + 1
    return ranking
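
Rankings from compute_ranking feed directly into the usual link-prediction metrics; a quick sketch (hypothetical rank values, the repo's actual evaluation loop lives in ultra/eval.py):

    import torch

    ranking = torch.tensor([1, 4, 2, 12])        # hypothetical filtered ranks
    mrr = (1.0 / ranking.float()).mean()         # mean reciprocal rank
    hits10 = (ranking <= 10).float().mean()      # Hits@10
    print(mrr.item(), hits10.item())             # ~0.458, 0.75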

def build_relation_graph(graph):

    # expect the graph to already contain inverse edges

    edge_index, edge_type = graph.edge_index, graph.edge_type
    num_nodes, num_rels = graph.num_nodes, graph.num_relations
    device = edge_index.device

    Eh = torch.vstack([edge_index[0], edge_type]).T.unique(dim=0)  # (num_edges, 2)
    Dh = scatter_add(torch.ones_like(Eh[:, 1]), Eh[:, 0])

    EhT = torch.sparse_coo_tensor(
        torch.flip(Eh, dims=[1]).T,
        torch.ones(Eh.shape[0], device=device) / Dh[Eh[:, 0]],
        (num_rels, num_nodes)
    )
    Eh = torch.sparse_coo_tensor(
        Eh.T,
        torch.ones(Eh.shape[0], device=device),
        (num_nodes, num_rels)
    )
    Et = torch.vstack([edge_index[1], edge_type]).T.unique(dim=0)  # (num_edges, 2)

    Dt = scatter_add(torch.ones_like(Et[:, 1]), Et[:, 0])
    assert not (Dt[Et[:, 0]] == 0).any()

    EtT = torch.sparse_coo_tensor(
        torch.flip(Et, dims=[1]).T,
        torch.ones(Et.shape[0], device=device) / Dt[Et[:, 0]],
        (num_rels, num_nodes)
    )
    Et = torch.sparse_coo_tensor(
        Et.T,
        torch.ones(Et.shape[0], device=device),
        (num_nodes, num_rels)
    )

    Ahh = torch.sparse.mm(EhT, Eh).coalesce()
    Att = torch.sparse.mm(EtT, Et).coalesce()
    Aht = torch.sparse.mm(EhT, Et).coalesce()
    Ath = torch.sparse.mm(EtT, Eh).coalesce()

    # the edge-type columns are created on `device` so the cat does not mix devices
    hh_edges = torch.cat([Ahh.indices().T, torch.zeros(Ahh.indices().T.shape[0], 1, dtype=torch.long, device=device).fill_(0)], dim=1)  # head to head
    tt_edges = torch.cat([Att.indices().T, torch.zeros(Att.indices().T.shape[0], 1, dtype=torch.long, device=device).fill_(1)], dim=1)  # tail to tail
    ht_edges = torch.cat([Aht.indices().T, torch.zeros(Aht.indices().T.shape[0], 1, dtype=torch.long, device=device).fill_(2)], dim=1)  # head to tail
    th_edges = torch.cat([Ath.indices().T, torch.zeros(Ath.indices().T.shape[0], 1, dtype=torch.long, device=device).fill_(3)], dim=1)  # tail to head

    rel_graph = Data(
        edge_index=torch.cat([hh_edges[:, [0, 1]].T, tt_edges[:, [0, 1]].T, ht_edges[:, [0, 1]].T, th_edges[:, [0, 1]].T], dim=1),
        edge_type=torch.cat([hh_edges[:, 2], tt_edges[:, 2], ht_edges[:, 2], th_edges[:, 2]], dim=0),
        num_nodes=num_rels,
        num_relations=4
    )

    graph.relation_graph = rel_graph
    return graph
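
build_relation_graph lifts the original graph to a graph over relations with four meta-edge types: 0 = head-to-head (two relations sharing a head entity), 1 = tail-to-tail, 2 = head-to-tail, 3 = tail-to-head. A toy check (assumes this file's imports, i.e. torch_scatter's scatter_add and PyG's Data, are in scope; not repo code):

    import torch
    from torch_geometric.data import Data

    # relations 0 and 1 share head node 0, so expect head-to-head meta-edges
    graph = Data(edge_index=torch.tensor([[0, 0], [1, 2]]),
                 edge_type=torch.tensor([0, 1]),
                 num_nodes=3)
    graph.num_relations = 2
    graph = build_relation_graph(graph)
    print(graph.relation_graph.edge_type)
    # tensor([0, 0, 0, 0, 1, 1]): four h2h pairs (incl. self-loops), two t2t self-loops
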
ultra/util.py
ADDED
@@ -0,0 +1,172 @@
import os
import sys
import ast
import copy
import time
import logging
import argparse

import yaml
import jinja2
from jinja2 import meta
import easydict

import torch
from torch import distributed as dist
from torch_geometric.data import Data
from torch_geometric.datasets import RelLinkPredDataset, WordNet18RR

from ultra import models, datasets


logger = logging.getLogger(__file__)

def detect_variables(cfg_file):
    with open(cfg_file, "r") as fin:
        raw = fin.read()
    env = jinja2.Environment()
    tree = env.parse(raw)
    vars = meta.find_undeclared_variables(tree)
    return vars
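
Configs here are Jinja2 templates over YAML; detect_variables collects the undeclared template variables so they can later be exposed as command-line flags. A self-contained check of the mechanism (inline template instead of a file):

    import jinja2
    from jinja2 import meta

    raw = "dataset:\n  class: {{ dataset }}\ntrain:\n  gpus: {{ gpus }}\n"
    tree = jinja2.Environment().parse(raw)
    print(meta.find_undeclared_variables(tree))  # {'dataset', 'gpus'}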

def load_config(cfg_file, context=None):
    with open(cfg_file, "r") as fin:
        raw = fin.read()
    template = jinja2.Template(raw)
    instance = template.render(context)
    cfg = yaml.safe_load(instance)
    cfg = easydict.EasyDict(cfg)
    return cfg
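
Usage sketch (the YAML path, context values, and config fields are hypothetical): render the template with concrete values, then access the result either by key or by attribute, since EasyDict supports both.

    cfg = load_config("config.yaml", context={"dataset": "FB15k237", "gpus": [0]})
    print(cfg.dataset["class"], cfg.train.gpus)  # key- and attribute-style access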

def literal_eval(string):
    try:
        return ast.literal_eval(string)
    except (ValueError, SyntaxError):
        return string
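
literal_eval lets command-line strings keep their Python types where possible, falling back to the raw string otherwise:

    print(literal_eval("[0, 1]"))   # [0, 1]  -> parsed into a list
    print(literal_eval("5.0e-3"))   # 0.005   -> parsed into a float
    print(literal_eval("null"))     # 'null'  -> not a Python literal, returned as-is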

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", help="yaml configuration file", required=True)
    parser.add_argument("-s", "--seed", help="random seed for PyTorch", type=int, default=1024)

    args, unparsed = parser.parse_known_args()
    # get dynamic arguments defined in the config file
    vars = detect_variables(args.config)
    parser = argparse.ArgumentParser()
    for var in vars:
        parser.add_argument("--%s" % var, required=True)
    vars = parser.parse_known_args(unparsed)[0]
    vars = {k: literal_eval(v) for k, v in vars._get_kwargs()}

    return args, vars
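
Parsing happens in two passes: the static flags (-c, -s) first, then one required flag per Jinja variable found in the config. A simulation of the flow (the temp file and flag values are hypothetical):

    import sys, tempfile

    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
        f.write("dataset:\n  class: {{ dataset }}\n")
        path = f.name

    sys.argv = ["train.py", "-c", path, "--dataset", "FB15k237"]
    args, vars = parse_args()
    print(args.config == path, vars)  # True {'dataset': 'FB15k237'}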

def get_root_logger(file=True):
    format = "%(asctime)-10s %(message)s"
    datefmt = "%H:%M:%S"
    logging.basicConfig(format=format, datefmt=datefmt)
    logger = logging.getLogger("")
    logger.setLevel(logging.INFO)

    if file:
        handler = logging.FileHandler("log.txt")
        format = logging.Formatter(format, datefmt)
        handler.setFormatter(format)
        logger.addHandler(handler)

    return logger

def get_rank():
    if dist.is_initialized():
        return dist.get_rank()
    if "RANK" in os.environ:
        return int(os.environ["RANK"])
    return 0

def get_world_size():
    if dist.is_initialized():
        return dist.get_world_size()
    if "WORLD_SIZE" in os.environ:
        return int(os.environ["WORLD_SIZE"])
    return 1


def synchronize():
    if get_world_size() > 1:
        dist.barrier()
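
Outside a torch.distributed launch these helpers degrade gracefully, so the same code path runs single- and multi-process:

    print(get_rank(), get_world_size())  # 0 1 (unless RANK/WORLD_SIZE env vars are set)
    synchronize()                        # no-op when the world size is 1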

def get_device(cfg):
    if cfg.train.gpus:
        device = torch.device(cfg.train.gpus[get_rank()])
    else:
        device = torch.device("cpu")
    return device


def get_devices(gpus):
    if gpus is not None:
        device = torch.device(gpus[get_rank()])
    else:
        device = torch.device("cpu")
    return device

def create_working_directory(cfg):
    file_name = "working_dir.tmp"
    world_size = get_world_size()
    if cfg.train.gpus is not None and len(cfg.train.gpus) != world_size:
        error_msg = "World size is %d but found %d GPUs in the argument"
        if world_size == 1:
            error_msg += ". Did you launch with `python -m torch.distributed.launch`?"
        raise ValueError(error_msg % (world_size, len(cfg.train.gpus)))
    if world_size > 1 and not dist.is_initialized():
        dist.init_process_group("nccl", init_method="env://")

    working_dir = os.path.join(os.path.expanduser(cfg.output_dir),
                               cfg.model["class"], cfg.dataset["class"], time.strftime("%Y-%m-%d-%H-%M-%S"))

    # synchronize working directory: rank 0 writes the timestamped path to a temp
    # file, the other ranks read it back after the barrier, so every worker ends up
    # in the same directory even though each computes strftime independently
    if get_rank() == 0:
        with open(file_name, "w") as fout:
            fout.write(working_dir)
        os.makedirs(working_dir)
    synchronize()
    if get_rank() != 0:
        with open(file_name, "r") as fin:
            working_dir = fin.read()
    synchronize()
    if get_rank() == 0:
        os.remove(file_name)

    os.chdir(working_dir)
    return working_dir
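
Single-process usage sketch (assumes an EasyDict cfg with these fields; all values hypothetical; note the function chdirs into the new directory as a side effect):

    import easydict

    cfg = easydict.EasyDict({
        "output_dir": "./output",
        "train": {"gpus": None},
        "model": {"class": "Ultra"},
        "dataset": {"class": "FB15k237"},
    })
    wd = create_working_directory(cfg)  # makes ./output/Ultra/FB15k237/<timestamp>
    print(wd)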

def build_dataset(cfg):
    data_config = copy.deepcopy(cfg.dataset)
    cls = data_config.pop("class")

    ds_cls = getattr(datasets, cls)
    dataset = ds_cls(**data_config)

    if get_rank() == 0:
        logger.warning("%s dataset" % (cls if "version" not in cfg.dataset else f'{cls}({cfg.dataset.version})'))
        if cls != "JointDataset":
            logger.warning("#train: %d, #valid: %d, #test: %d" %
                           (dataset[0].target_edge_index.shape[1], dataset[1].target_edge_index.shape[1],
                            dataset[2].target_edge_index.shape[1]))
        else:
            logger.warning("#train: %d, #valid: %d, #test: %d" %
                           (sum(d.target_edge_index.shape[1] for d in dataset._data[0]),
                            sum(d.target_edge_index.shape[1] for d in dataset._data[1]),
                            sum(d.target_edge_index.shape[1] for d in dataset._data[2]),
                            ))

    return dataset
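
build_dataset maps the "class" key of the config onto a class in ultra.datasets and forwards the remaining keys as constructor kwargs; a usage sketch (triggers a dataset download on first run):

    import easydict

    cfg = easydict.EasyDict({"dataset": {"class": "CoDExSmall", "root": "./datasets/"}})
    dataset = build_dataset(cfg)  # instantiates ultra.datasets.CoDExSmall(root="./datasets/")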