forked from mindspore-Ecosystem/mindspore
support pubmed dataset
This commit is contained in:
parent
188d1fc777
commit
0a07f6c909
|
@ -0,0 +1,105 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
"""
|
||||
User-defined API for MindRecord GNN writer.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pickle as pkl
|
||||
import numpy as np
|
||||
import scipy.sparse as sp
|
||||
|
||||
# Arguments are read from the 'graph_api_args' environment variable
# (presumably exported by writer.py from its --graph_api_args command-line
# parameter -- TODO confirm against writer.py).
|
||||
# Multiple arguments are separated by ':'; only the first one is used below.
# NOTE: a KeyError is raised at import time if 'graph_api_args' is not set.
|
||||
args = os.environ['graph_api_args'].split(':')
|
||||
# Directory that contains the 'ind.pubmed.*' input files.
PUBMED_PATH = args[0]
|
||||
# Dataset name used to build the 'ind.<dataset_str>.<name>' file names.
dataset_str = 'pubmed'
|
||||
|
||||
# profile: (num_features, feature_data_types, feature_shapes)
|
||||
# Nodes carry two features: 'feature_1' (float32, the normalized feature row)
# and 'feature_2' (int32, the label index), as produced by yield_nodes.
# A shape of [-1] presumably denotes a variable-length 1-D feature -- confirm
# against the writer.
node_profile = (2, ["float32", "int32"], [[-1], [-1]])
|
||||
# Edges carry no features.
edge_profile = (0, [], [])
|
||||
|
||||
|
||||
def _normalize_cora_features(features):
    """Row-normalize a feature matrix.

    Every row is scaled by the reciprocal of its sum, so each row with a
    non-zero sum adds up to 1. Rows whose sum is zero are left as all zeros.

    Args:
        features: 2-D feature matrix supporting ``sum(1)``; the caller in
            this file passes a SciPy sparse matrix.

    Returns:
        The product of the diagonal scaling matrix and ``features``.
    """
    row_sum = np.array(features.sum(1))
    # The reciprocal of a zero row sum is inf; that case is handled on the
    # next line, so suppress the divide-by-zero RuntimeWarning it would emit.
    with np.errstate(divide='ignore'):
        r_inv = np.power(row_sum * 1.0, -1).flatten()
    # Zero-sum rows get a scale factor of 0 instead of inf.
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(features)
|
||||
|
||||
|
||||
def _parse_index_file(filename):
    """Parse index file.

    Args:
        filename (str): path of a text file holding one integer per line.

    Returns:
        list[int]: the integers in file order. Blank lines are skipped.

    Raises:
        ValueError: if a non-blank line is not a valid integer.
    """
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as index_file:
        # int() tolerates surrounding whitespace, including the newline.
        return [int(line) for line in index_file if line.strip()]
|
||||
|
||||
|
||||
def yield_nodes(task_id=0):
    """
    Generate node data

    Loads the 'tx', 'ty', 'allx' and 'ally' pickles and the test index file
    from PUBMED_PATH, stacks them into one feature matrix and one label
    matrix, row-normalizes the features and yields one dict per node.

    Args:
        task_id (int): task identifier; only used in the log message.

    Yields:
        data (dict): data row which is dict, with keys 'id' (row number),
        'type' (always 0), 'feature_1' (normalized feature row as a list)
        and 'feature_2' (position of the first 1 in the label row).

    Raises:
        ValueError: if a label row contains no element equal to 1.
    """
    print("Node task is {}".format(task_id))

    names = ['tx', 'ty', 'allx', 'ally']
    objects = []
    for name in names:
        with open("{}/ind.{}.{}".format(PUBMED_PATH, dataset_str, name), 'rb') as f:
            # NOTE: pickle can execute arbitrary code while loading; only
            # point PUBMED_PATH at dataset files from a trusted source.
            objects.append(pkl.load(f, encoding='latin1'))
    tx, ty, allx, ally = tuple(objects)
    test_idx_reorder = _parse_index_file(
        "{}/ind.{}.test.index".format(PUBMED_PATH, dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    features = sp.vstack((allx, tx)).tolil()
    # Copy the rows found at the sorted test indices to the positions listed
    # in the index file (presumably so that row i describes node i).
    features[test_idx_reorder, :] = features[test_idx_range, :]
    features = _normalize_cora_features(features)
    # `.toarray()` is the documented dense conversion; the `.A` shorthand is
    # deprecated in recent SciPy releases.
    features = features.toarray()

    labels = np.vstack((ally, ty))
    # Same row permutation as applied to the features above.
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    line_count = 0
    for i, label in enumerate(labels):
        node = {'id': i, 'type': 0, 'feature_1': features[i].tolist(),
                'feature_2': label.tolist().index(1)}
        line_count += 1
        yield node
    print('Processed {} lines for nodes.'.format(line_count))
|
||||
|
||||
|
||||
def yield_edges(task_id=0):
    """
    Generate edge data

    Reads the pickled graph mapping from PUBMED_PATH and emits one edge for
    every (source, destination) pair found in it.

    Args:
        task_id (int): task identifier; only used in the log message.

    Yields:
        data (dict): data row which is dict, with keys 'id' (running edge
        counter starting at 0), 'src_id', 'dst_id' and 'type' (always 0).
    """
    print("Edge task is {}".format(task_id))
    graph_path = "{}/ind.{}.graph".format(PUBMED_PATH, dataset_str)
    with open(graph_path, 'rb') as handle:
        adjacency = pkl.load(handle, encoding='latin1')

    edge_total = 0
    for src in adjacency:
        for dst in adjacency[src]:
            # The edge id is the number of edges emitted before this one.
            yield dict(id=edge_total, src_id=src, dst_id=dst, type=0)
            edge_total += 1
    print('Processed {} lines for edges.'.format(edge_total))
|
|
@ -0,0 +1,12 @@
|
|||
#!/bin/bash
# Convert the PubMed dataset under SRC_PATH into MindRecord files under
# MINDRECORD_PATH by running writer.py with the 'pubmed' script.
SRC_PATH=/tmp/pubmed/dataset
MINDRECORD_PATH=/tmp/pubmed/mindrecord

# Make sure the output directory exists before writing into it.
mkdir -p "$MINDRECORD_PATH"

# Remove output left over from a previous run. The path is quoted, and
# ${VAR:?} aborts if it is ever empty, so this can never expand to 'rm -f /*'.
rm -f "${MINDRECORD_PATH:?}"/*

python writer.py --mindrecord_script pubmed \
--mindrecord_file "$MINDRECORD_PATH/pubmed_mr" \
--mindrecord_partitions 1 \
--mindrecord_header_size_by_bit 18 \
--mindrecord_page_size_by_bit 20 \
--graph_api_args "$SRC_PATH"
|
Loading…
Reference in New Issue