support pubmed dataset

2020-06-30 17:44:29 +08:00 · 2020-06-30 17:44:29 +08:00 · 0a07f6c909
parent 188d1fc777
commit 0a07f6c909
3 changed files with 117 additions and 0 deletions
--- a/model_zoo/utils/graph_to_mindrecord/pubmed/__init__.py
+++ b/model_zoo/utils/graph_to_mindrecord/pubmed/__init__.py
--- a/model_zoo/utils/graph_to_mindrecord/pubmed/mr_api.py
+++ b/model_zoo/utils/graph_to_mindrecord/pubmed/mr_api.py
@@ -0,0 +1,105 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""
+User-defined API for MindRecord GNN writer.
+"""
+import os
+
+import pickle as pkl
+import numpy as np
+import scipy.sparse as sp
+
+# parse args from command line parameter 'graph_api_args'
+#     args delimiter is ':'
+# The value is read from the environment variable 'graph_api_args'; a missing
+# variable raises KeyError at import time.
+args = os.environ['graph_api_args'].split(':')
+# First argument: directory that holds the "ind.<dataset>.<part>" files.
+PUBMED_PATH = args[0]
+# Dataset name used when building the "ind.<dataset>.<part>" file names.
+dataset_str = 'pubmed'
+
+# profile:  (num_features, feature_data_types, feature_shapes)
+# Nodes carry two features; presumably 'feature_1' (float32 feature row) and
+# 'feature_2' (int32 label index) as emitted by yield_nodes -- TODO confirm
+# the ordering against the writer.
+node_profile = (2, ["float32", "int32"], [[-1], [-1]])
+# Edges carry no features.
+edge_profile = (0, [], [])
+
+
+def _normalize_cora_features(features):
+    """
+    Row-normalize a feature matrix.
+
+    Args:
+        features: 2-D feature matrix with one row per sample.
+
+    Returns:
+        The product of diag(1 / row_sum) and `features`; rows whose sum is
+        zero are left as all zeros.
+    """
+    # Multiply by 1.0 first: a negative power of an integer array is an error.
+    totals = np.array(features.sum(1)) * 1.0
+    inverse = np.power(totals, -1).flatten()
+    # A zero row sum gives an infinite reciprocal; use 0 so the row stays zero.
+    inverse[np.isinf(inverse)] = 0.
+    return sp.diags(inverse).dot(features)
+
+
+def _parse_index_file(filename):
+    """
+    Parse index file.
+
+    Args:
+        filename (str): path of a text file holding one integer per line.
+
+    Returns:
+        list[int], the integers in file order.
+
+    Raises:
+        ValueError: if a line cannot be converted to int.
+    """
+    # Use a context manager so the file handle is closed deterministically
+    # instead of being left to the garbage collector.
+    with open(filename) as index_file:
+        return [int(line.strip()) for line in index_file]
+
+
+def yield_nodes(task_id=0):
+    """
+    Generate node data
+
+    Loads the 'tx', 'ty', 'allx' and 'ally' pickles and the test index file
+    from PUBMED_PATH, reorders the test rows according to the test index
+    file, row-normalizes the features and emits one dict per label row.
+
+    Args:
+        task_id (int): task identifier; only printed here.
+
+    Yields:
+        data (dict): data row which is dict, with keys 'id', 'type',
+            'feature_1' (feature row as a list) and 'feature_2' (position of
+            the first 1 in the label row).
+    """
+    print("Node task is {}".format(task_id))
+
+    def _load_part(part):
+        # NOTE(review): unpickling can run arbitrary code; use trusted files only.
+        with open("{}/ind.{}.{}".format(PUBMED_PATH, dataset_str, part), 'rb') as f:
+            return pkl.load(f, encoding='latin1')
+
+    tx, ty, allx, ally = [_load_part(part) for part in ('tx', 'ty', 'allx', 'ally')]
+    shuffled_test_ids = _parse_index_file(
+        "{}/ind.{}.test.index".format(PUBMED_PATH, dataset_str))
+    sorted_test_ids = np.sort(shuffled_test_ids)
+
+    # Stack the 'allx' rows on top of the 'tx' rows, then copy each row from a
+    # sorted test index to the index listed at the same position in the file.
+    features = sp.vstack((allx, tx)).tolil()
+    features[shuffled_test_ids, :] = features[sorted_test_ids, :]
+    features = _normalize_cora_features(features).toarray()
+
+    # Apply the same reordering to the label rows.
+    labels = np.vstack((ally, ty))
+    labels[shuffled_test_ids, :] = labels[sorted_test_ids, :]
+
+    node_total = 0
+    for node_id, one_hot in enumerate(labels):
+        yield {'id': node_id, 'type': 0,
+               'feature_1': features[node_id].tolist(),
+               'feature_2': one_hot.tolist().index(1)}
+        node_total += 1
+    print('Processed {} lines for nodes.'.format(node_total))
+
+
+def yield_edges(task_id=0):
+    """
+    Generate edge data
+
+    Args:
+        task_id (int): task identifier; only printed here.
+
+    Yields:
+        data (dict): data row which is dict, with keys 'id' (running edge
+            counter), 'src_id', 'dst_id' and 'type' (always 0).
+    """
+    print("Edge task is {}".format(task_id))
+    # Load the whole graph first so the file is closed before any edge is
+    # yielded, instead of staying open for the lifetime of the generator.
+    # NOTE(review): unpickling can run arbitrary code; use trusted files only.
+    with open("{}/ind.{}.graph".format(PUBMED_PATH, dataset_str), 'rb') as f:
+        graph = pkl.load(f, encoding='latin1')
+
+    line_count = 0
+    for src_id in graph:
+        for dst_id in graph[src_id]:
+            yield {'id': line_count,
+                   'src_id': src_id, 'dst_id': dst_id, 'type': 0}
+            line_count += 1
+    print('Processed {} lines for edges.'.format(line_count))
--- a/model_zoo/utils/graph_to_mindrecord/write_pubmed.sh
+++ b/model_zoo/utils/graph_to_mindrecord/write_pubmed.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Run writer.py with the pubmed script to write the dataset found in SRC_PATH
+# to MindRecord files under MINDRECORD_PATH.
+SRC_PATH=/tmp/pubmed/dataset
+MINDRECORD_PATH=/tmp/pubmed/mindrecord
+
+# Make sure the output directory exists, then drop files from earlier runs.
+# The path is quoted so it is never word-split or glob-expanded by accident.
+mkdir -p "$MINDRECORD_PATH"
+rm -f "$MINDRECORD_PATH"/*
+
+python writer.py --mindrecord_script pubmed \
+--mindrecord_file "$MINDRECORD_PATH/pubmed_mr" \
+--mindrecord_partitions 1 \
+--mindrecord_header_size_by_bit 18 \
+--mindrecord_page_size_by_bit 20 \
+--graph_api_args "$SRC_PATH"