diff --git a/tests/st/gnn/aggregator.py b/tests/st/gnn/aggregator.py
index 18f189d979f..5e208a2329a 100644
--- a/tests/st/gnn/aggregator.py
+++ b/tests/st/gnn/aggregator.py
@@ -64,7 +64,7 @@ class GNNFeatureTransform(nn.Cell):
         [[ 2.5246444   2.2738023   0.5711005  -3.9399147 ]
          [ 1.0739875   4.0155234   0.94188046 -5.459526  ]]
     """
-    @cell_attr_register(attrs=['has_bias', 'activation'])
+    @cell_attr_register
     def __init__(self,
                  in_channels,
                  out_channels,
@@ -125,7 +125,7 @@ class _BaseAggregator(nn.Cell):
             same as input x. The values of str refer to the function `initializer`. Default: 'zeros'.
         has_bias (bool): Specifies whether the layer uses a bias vector. Default: True.
         dropout_ratio (float): The keep rate of dropout layer, greater than 0 and less equal than 1. Default: None.
-        activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None.        
+        activation (str): Regularizer function applied to the output of the layer, eg. 'relu'. Default: None.
 
     Examples:
         >>> class MyAggregator(_BaseAggregator):
@@ -203,12 +203,12 @@ class MeanAggregator(_BaseAggregator):
         super(MeanAggregator, self).__init__(
             feature_in_dim,
             feature_out_dim,
-            use_fc=True,
-            weight_init="normal",
-            bias_init="zeros",
-            has_bias=True,
-            dropout_ratio=None,
-            activation=None)
+            use_fc,
+            weight_init,
+            bias_init,
+            has_bias,
+            dropout_ratio,
+            activation)
         self.reduce_mean = P.ReduceMean(keep_dims=False)
 
     def construct(self, input_feature):
@@ -220,3 +220,157 @@ class MeanAggregator(_BaseAggregator):
             input_feature = self.activation(input_feature)
         output_feature = self.reduce_mean(input_feature, 1)
         return output_feature
+
+
+class AttentionHead(nn.Cell):
+    """
+    Attention Head for Graph Attention Networks.
+
+    Args:
+        in_channel (int): The number of input channel, input feature dim.
+        out_channel (int): The number of output channel, output feature dim.
+        in_drop_ratio (float): Input feature dropout ratio, default 0.0.
+        coef_drop_ratio (float): Coefficient dropout ratio, default 0.0.
+        residual (bool): Whether to use residual connection, default False.
+        coef_activation (Cell): The attention coefficient activation function,
+            default nn.LeakyReLU().
+        activation (Cell): The output activation function, default nn.ELU().
+
+    Inputs:
+        - **input_feature** (Tensor) - Tensor of shape : (batch_size, num_nodes, feature_dim).
+        - **bias_mat** (Tensor) - Tensor of shape : (batch_size, num_nodes, num_nodes).
+
+    Examples:
+        >>> head = AttentionHead(1433,
+                                 8,
+                                 in_drop_ratio=0.6,
+                                 coef_drop_ratio=0.6,
+                                 residual=False)
+        >>> input_data = Tensor(np.array(np.random.rand(1, 2708, 1433), dtypy=np.float32))
+        >>> output = net(input_data)
+    """
+
+    def __init__(self,
+                 in_channel,
+                 out_channel,
+                 in_drop_ratio=0.0,
+                 coef_drop_ratio=0.0,
+                 residual=False,
+                 coef_activation=nn.LeakyReLU(),
+                 activation=nn.ELU()):
+        super(AttentionHead, self).__init__()
+        self.in_channel = check_int_positive(in_channel)
+        self.out_channel = check_int_positive(out_channel)
+        self.in_drop_ratio = in_drop_ratio
+        self.in_drop = nn.Dropout(keep_prob=1 - in_drop_ratio)
+        self.in_drop_2 = nn.Dropout(keep_prob=1 - in_drop_ratio)
+        self.feature_transform = GNNFeatureTransform(
+            in_channels=self.in_channel,
+            out_channels=self.out_channel,
+            has_bias=False)
+
+        self.f_1_transform = GNNFeatureTransform(
+            in_channels=self.out_channel,
+            out_channels=1)
+        self.f_2_transform = GNNFeatureTransform(
+            in_channels=self.out_channel,
+            out_channels=1)
+        self.softmax = nn.Softmax()
+
+        self.coef_drop = nn.Dropout(keep_prob=1 - coef_drop_ratio)
+        self.batch_matmul = P.BatchMatMul()
+        self.bias_add = P.BiasAdd()
+        self.bias = Parameter(initializer('zeros', self.out_channel), name='bias')
+        self.residual = check_bool(residual)
+        if self.residual:
+            if in_channel != out_channel:
+                self.residual_transform_flag = True
+                self.residual_transform = GNNFeatureTransform(
+                    in_channels=self.in_channel,
+                    out_channels=self.out_channel)
+            else:
+                self.residual_transform = None
+        self.coef_activation = coef_activation
+        self.activation = activation
+
+    def construct(self, input_feature, bias_mat):
+        input_feature = self.in_drop(input_feature)
+
+        feature = self.feature_transform(input_feature)
+        # self attention following the author
+        f_1 = self.f_1_transform(feature)
+        f_2 = self.f_2_transform(feature)
+        logits = f_1 + P.Transpose()(f_2, (0, 2, 1))
+        logits = self.coef_activation(logits) + bias_mat
+        coefs = self.softmax(logits)
+
+        coefs = self.coef_drop(coefs)
+        feature = self.in_drop_2(feature)
+
+        ret = self.batch_matmul(coefs, feature)
+        ret = P.Squeeze(0)(ret)
+        ret = self.bias_add(ret, self.bias)
+        ret = P.ExpandDims()(ret, 0)
+        # residual connection
+        if self.residual:
+            if self.residual_transform_flag:
+                res = self.residual_transform(input_feature)
+                ret = ret + res
+            else:
+                ret = ret + input_feature
+        # activation
+        ret = self.activation(ret)
+        return ret
+
+
+class AttentionAggregator(nn.Cell):
+    """
+    Attention Head for Graph Attention Networks，can be regarded as one
+        GAT layer.
+
+    Args:
+        in_channel (int): Input channel.
+        out_channel (int): Output channel.
+        num_heads (int): Number of attention heads for this layer, default 1.
+        in_drop_ratio (float): Input feature dropout ratio, default 0.0.
+        coef_drop_ratio (float): Coefficient dropout ratio, default 0.0.
+        activation (Cell): The output activation function, default nn.ELU().
+        residual (bool): Whether to use residual connection, default False.
+
+    Inputs:
+        - **input_feature** (Tensor) - Tensor of shape : (batch_size, num_nodes, feature_dim).
+        - **bias_mat** (Tensor) - Tensor of shape : (batch_size, num_nodes, num_nodes).
+
+    Examples:
+        >>> input_data = Tensor(np.array(np.random.rand(1, 2708, 1433), dtype=np.float32))
+        >>> biases = Tensor(np.array(np.random.rand(1, 2708, 2708), dtype=np.float32))
+        >>> net = AttentionAggregator(1433,
+                                      8,
+                                      8)
+        >>> net(input_data, biases)
+    """
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_heads=1,
+                 in_drop=0.0,
+                 coef_drop=0.0,
+                 activation=nn.ELU(),
+                 residual=False):
+        super(AttentionAggregator, self).__init__()
+        self.num_heads = num_heads
+        self.attns = []
+        for _ in range(num_heads):
+            self.attns.append(AttentionHead(in_channels,
+                                            out_channels,
+                                            in_drop_ratio=in_drop,
+                                            coef_drop_ratio=coef_drop,
+                                            activation=activation,
+                                            residual=residual))
+        self.attns = nn.layer.CellList(self.attns)
+
+    def construct(self, input_data, bias_mat):
+        res = ()
+        for i in range(self.num_heads):
+            res += (self.attns[i](input_data, bias_mat),)
+        return P.Concat(-1)(res)
diff --git a/tests/st/gnn/test_gnn_aggregator.py b/tests/st/gnn/test_gnn_aggregator.py
index bba7c09c311..6335b4c8327 100644
--- a/tests/st/gnn/test_gnn_aggregator.py
+++ b/tests/st/gnn/test_gnn_aggregator.py
@@ -20,7 +20,7 @@ import mindspore.context as context
 from mindspore import Tensor
 from mindspore.common.api import _executor
 import mindspore.ops.composite as C
-from aggregator import MeanAggregator
+from aggregator import MeanAggregator, AttentionHead, AttentionAggregator
 
 context.set_context(mode=context.GRAPH_MODE)
 
@@ -51,3 +51,22 @@ def test_MeanAggregator_grad():
     sens = Tensor(np.ones([32, 64]).astype(np.float32))
     grad_op = MeanAggregatorGrad(aggregator)
     _executor.compile(grad_op, input_data, sens)
+
+
+def test_AttentionHead():
+    """Compile AttentionHead forward graph"""
+    head = AttentionHead(1433,
+                         8,
+                         in_drop_ratio=0.6,
+                         coef_drop_ratio=0.6,
+                         residual=False)
+    input_data = Tensor(np.array(np.random.rand(1, 2708, 1433), dtype=np.float32))
+    biases = Tensor(np.array(np.random.rand(1, 2708, 2708), dtype=np.float32))
+    _executor.compile(head, input_data, biases)
+
+
+def test_AttentionAggregator():
+    input_data = Tensor(np.array(np.random.rand(1, 2708, 1433), dtype=np.float32))
+    biases = Tensor(np.array(np.random.rand(1, 2708, 2708), dtype=np.float32))
+    net = AttentionAggregator(1433, 8, 8)
+    _executor.compile(net, input_data, biases)