support pubmed dataset

This commit is contained in:
heleiwang 2020-06-30 17:44:29 +08:00
parent 188d1fc777
commit 0a07f6c909
3 changed files with 117 additions and 0 deletions

View File

@ -0,0 +1,105 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
User-defined API for MindRecord GNN writer.
"""
import os
import pickle as pkl
import numpy as np
import scipy.sparse as sp
# parse args from command line parameter 'graph_api_args'
# args delimiter is ':'
args = os.environ['graph_api_args'].split(':')
PUBMED_PATH = args[0]
dataset_str = 'pubmed'
# profile: (num_features, feature_data_types, feature_shapes)
node_profile = (2, ["float32", "int32"], [[-1], [-1]])
edge_profile = (0, [], [])
def _normalize_cora_features(features):
row_sum = np.array(features.sum(1))
r_inv = np.power(row_sum * 1.0, -1).flatten()
r_inv[np.isinf(r_inv)] = 0.
r_mat_inv = sp.diags(r_inv)
features = r_mat_inv.dot(features)
return features
def _parse_index_file(filename):
"""Parse index file."""
index = []
for line in open(filename):
index.append(int(line.strip()))
return index
def yield_nodes(task_id=0):
"""
Generate node data
Yields:
data (dict): data row which is dict.
"""
print("Node task is {}".format(task_id))
names = ['tx', 'ty', 'allx', 'ally']
objects = []
for name in names:
with open("{}/ind.{}.{}".format(PUBMED_PATH, dataset_str, name), 'rb') as f:
objects.append(pkl.load(f, encoding='latin1'))
tx, ty, allx, ally = tuple(objects)
test_idx_reorder = _parse_index_file(
"{}/ind.{}.test.index".format(PUBMED_PATH, dataset_str))
test_idx_range = np.sort(test_idx_reorder)
features = sp.vstack((allx, tx)).tolil()
features[test_idx_reorder, :] = features[test_idx_range, :]
features = _normalize_cora_features(features)
features = features.A
labels = np.vstack((ally, ty))
labels[test_idx_reorder, :] = labels[test_idx_range, :]
line_count = 0
for i, label in enumerate(labels):
node = {'id': i, 'type': 0, 'feature_1': features[i].tolist(),
'feature_2': label.tolist().index(1)}
line_count += 1
yield node
print('Processed {} lines for nodes.'.format(line_count))
def yield_edges(task_id=0):
"""
Generate edge data
Yields:
data (dict): data row which is dict.
"""
print("Edge task is {}".format(task_id))
with open("{}/ind.{}.graph".format(PUBMED_PATH, dataset_str), 'rb') as f:
graph = pkl.load(f, encoding='latin1')
line_count = 0
for i in graph:
for dst_id in graph[i]:
edge = {'id': line_count,
'src_id': i, 'dst_id': dst_id, 'type': 0}
line_count += 1
yield edge
print('Processed {} lines for edges.'.format(line_count))

View File

@ -0,0 +1,12 @@
#!/bin/bash
SRC_PATH=/tmp/pubmed/dataset
MINDRECORD_PATH=/tmp/pubmed/mindrecord
rm -f $MINDRECORD_PATH/*
python writer.py --mindrecord_script pubmed \
--mindrecord_file "$MINDRECORD_PATH/pubmed_mr" \
--mindrecord_partitions 1 \
--mindrecord_header_size_by_bit 18 \
--mindrecord_page_size_by_bit 20 \
--graph_api_args "$SRC_PATH"