!2789 support pubmed dataset
Merge pull request !2789 from heleiwang/support_pubmed
This commit is contained in:
commit
c96046d94b
|
@ -0,0 +1,105 @@
|
||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ==============================================================================
|
||||||
|
"""
|
||||||
|
User-defined API for MindRecord GNN writer.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pickle as pkl
|
||||||
|
import numpy as np
|
||||||
|
import scipy.sparse as sp
|
||||||
|
|
||||||
|
# parse args from command line parameter 'graph_api_args'
|
||||||
|
# args delimiter is ':'
|
||||||
|
args = os.environ['graph_api_args'].split(':')
|
||||||
|
PUBMED_PATH = args[0]
|
||||||
|
dataset_str = 'pubmed'
|
||||||
|
|
||||||
|
# profile: (num_features, feature_data_types, feature_shapes)
|
||||||
|
node_profile = (2, ["float32", "int32"], [[-1], [-1]])
|
||||||
|
edge_profile = (0, [], [])
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_cora_features(features):
|
||||||
|
row_sum = np.array(features.sum(1))
|
||||||
|
r_inv = np.power(row_sum * 1.0, -1).flatten()
|
||||||
|
r_inv[np.isinf(r_inv)] = 0.
|
||||||
|
r_mat_inv = sp.diags(r_inv)
|
||||||
|
features = r_mat_inv.dot(features)
|
||||||
|
return features
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_index_file(filename):
|
||||||
|
"""Parse index file."""
|
||||||
|
index = []
|
||||||
|
for line in open(filename):
|
||||||
|
index.append(int(line.strip()))
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def yield_nodes(task_id=0):
|
||||||
|
"""
|
||||||
|
Generate node data
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
data (dict): data row which is dict.
|
||||||
|
"""
|
||||||
|
print("Node task is {}".format(task_id))
|
||||||
|
|
||||||
|
names = ['tx', 'ty', 'allx', 'ally']
|
||||||
|
objects = []
|
||||||
|
for name in names:
|
||||||
|
with open("{}/ind.{}.{}".format(PUBMED_PATH, dataset_str, name), 'rb') as f:
|
||||||
|
objects.append(pkl.load(f, encoding='latin1'))
|
||||||
|
tx, ty, allx, ally = tuple(objects)
|
||||||
|
test_idx_reorder = _parse_index_file(
|
||||||
|
"{}/ind.{}.test.index".format(PUBMED_PATH, dataset_str))
|
||||||
|
test_idx_range = np.sort(test_idx_reorder)
|
||||||
|
|
||||||
|
features = sp.vstack((allx, tx)).tolil()
|
||||||
|
features[test_idx_reorder, :] = features[test_idx_range, :]
|
||||||
|
features = _normalize_cora_features(features)
|
||||||
|
features = features.A
|
||||||
|
|
||||||
|
labels = np.vstack((ally, ty))
|
||||||
|
labels[test_idx_reorder, :] = labels[test_idx_range, :]
|
||||||
|
|
||||||
|
line_count = 0
|
||||||
|
for i, label in enumerate(labels):
|
||||||
|
node = {'id': i, 'type': 0, 'feature_1': features[i].tolist(),
|
||||||
|
'feature_2': label.tolist().index(1)}
|
||||||
|
line_count += 1
|
||||||
|
yield node
|
||||||
|
print('Processed {} lines for nodes.'.format(line_count))
|
||||||
|
|
||||||
|
|
||||||
|
def yield_edges(task_id=0):
|
||||||
|
"""
|
||||||
|
Generate edge data
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
data (dict): data row which is dict.
|
||||||
|
"""
|
||||||
|
print("Edge task is {}".format(task_id))
|
||||||
|
with open("{}/ind.{}.graph".format(PUBMED_PATH, dataset_str), 'rb') as f:
|
||||||
|
graph = pkl.load(f, encoding='latin1')
|
||||||
|
line_count = 0
|
||||||
|
for i in graph:
|
||||||
|
for dst_id in graph[i]:
|
||||||
|
edge = {'id': line_count,
|
||||||
|
'src_id': i, 'dst_id': dst_id, 'type': 0}
|
||||||
|
line_count += 1
|
||||||
|
yield edge
|
||||||
|
print('Processed {} lines for edges.'.format(line_count))
|
|
@ -0,0 +1,12 @@
|
||||||
|
#!/bin/bash
|
||||||
|
SRC_PATH=/tmp/pubmed/dataset
|
||||||
|
MINDRECORD_PATH=/tmp/pubmed/mindrecord
|
||||||
|
|
||||||
|
rm -f $MINDRECORD_PATH/*
|
||||||
|
|
||||||
|
python writer.py --mindrecord_script pubmed \
|
||||||
|
--mindrecord_file "$MINDRECORD_PATH/pubmed_mr" \
|
||||||
|
--mindrecord_partitions 1 \
|
||||||
|
--mindrecord_header_size_by_bit 18 \
|
||||||
|
--mindrecord_page_size_by_bit 20 \
|
||||||
|
--graph_api_args "$SRC_PATH"
|
Loading…
Reference in New Issue