forked from mindspore-Ecosystem/mindspore
add me daily monitor files
This commit is contained in:
parent
2e9a52fc5c
commit
7e3afac1c0
|
@ -0,0 +1,143 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
import sys
|
||||
import re
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import openpyxl as opx
|
||||
|
||||
|
||||
def parse_arguments():
    """Read the four positional command-line arguments of this script.

    Expected ``sys.argv`` layout:
    ``<me_report_path> <log_path> <n_iter> <out>``.

    Returns:
        tuple: ``(me_report_path, log_path, n_iter, out)`` where ``n_iter``
        has been converted to ``int`` and the other items are raw strings.

    Raises:
        ValueError: if fewer than four arguments were supplied, or if
            ``n_iter`` is not made of digits only.
    """
    print(sys.argv)
    if len(sys.argv) < 5:
        raise ValueError(
            "expected 4 arguments (me_report_path log_path n_iter out), "
            "got %d" % (len(sys.argv) - 1))
    me_report_path, log_path, n_iter, out = sys.argv[1:5]
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not n_iter.isdigit():
        raise ValueError(
            "n_iter must be a non-negative integer, got %r" % n_iter)
    return me_report_path, log_path, int(n_iter), out
|
||||
|
||||
|
||||
def extract_by_keyword(doc, keyword, pattern):
    """Collect regex matches from the lines of ``doc`` that contain ``keyword``.

    Args:
        doc: iterable of text lines (for example an open file).
        keyword: substring a line must contain in order to be searched.
        pattern: regular expression passed to ``re.findall`` for such lines.

    Returns:
        list: all matches, in the order the lines were encountered.
    """
    matches = []
    for line_no, line in enumerate(doc):
        if keyword not in line:
            continue
        found = re.findall(pattern, line)
        # Trace every hit so the extraction can be checked against the log.
        print("L%d: extracted %s from '%s'" % (line_no, found, line.strip()))
        matches += found
    return matches
|
||||
|
||||
|
||||
def process_log(fname, log_path, n_iter, keyword, pattern):
    """Run ``extract_by_keyword`` over the log files numbered 1 to ``n_iter``.

    Args:
        fname: file-name template with one ``%d`` placeholder for the number.
        log_path: directory containing the log files.
        n_iter: number of log files to read.
        keyword: substring selecting the lines of interest.
        pattern: regular expression applied to the selected lines.

    Returns:
        dict: maps each formatted file name to its list of extracted matches.
    """
    results = {}
    for index in range(1, n_iter + 1):
        name = fname % index
        full_path = os.path.join(log_path, name)
        with open(full_path) as handle:
            print("\nLoading %s" % full_path)
            results[name] = extract_by_keyword(handle, keyword, pattern)
    return results
|
||||
|
||||
|
||||
def summarize(func):
    """Decorate a log processor so its result also carries summary statistics.

    ``func`` must return a dict mapping a log file name to a numeric time.
    The wrapper adds three keys to that dict and returns it:

    * ``"min_time"``: the smallest time,
    * ``"min_file"``: the file name that produced it (first one on ties),
    * ``"avg_time"``: the arithmetic mean of all times.

    Raises:
        ValueError: raised by the wrapper when ``func`` returns an empty dict.
    """
    import functools  # local import: only needed for ``wraps``

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        log = func(*args, **kwargs)
        if not log:
            raise ValueError(
                "%s returned no timings to summarize"
                % getattr(func, "__name__", repr(func)))
        # Compute everything before the summary keys are inserted into ``log``.
        min_file, min_time = min(log.items(), key=lambda item: item[1])
        avg = sum(log.values()) / len(log)
        log["min_time"] = min_time
        log["min_file"] = min_file
        log["avg_time"] = avg
        return log
    return wrapper
|
||||
|
||||
|
||||
@summarize
def process_bert_log(log_path, n_iter):
    """Compute, per BERT log file, the ``TotalTime`` minus ``task_emit`` time.

    Reads ``bert1.log`` .. ``bert<n_iter>.log`` from ``log_path`` and uses the
    first decimal number extracted for each of the two keywords.

    Args:
        log_path: directory containing the log files.
        n_iter: number of log files to read.

    Returns:
        dict: maps each log file name to the computed time (the ``summarize``
        decorator then adds its summary entries).

    Raises:
        ValueError: if a log file yields no decimal number for a keyword.
    """
    template = "bert%d.log"
    # The dot is escaped so only real decimals such as "12.34" match; left
    # unescaped it would also accept text like "2021-03" or "123".
    number = r"\d+\.\d+"
    total = process_log(template, log_path, n_iter, "TotalTime", number)
    task = process_log(template, log_path, n_iter, "task_emit", number)
    log = {}
    for name in total:
        if not total[name] or not task[name]:
            raise ValueError(
                "no TotalTime/task_emit value found in %s" % name)
        log[name] = float(total[name][0]) - float(task[name][0])
    return log
|
||||
|
||||
|
||||
@summarize
def process_resnet_log(log_path, n_iter):
    """Compute, per ResNet log file, the ``TotalTime`` minus ``task_emit`` time.

    Reads ``resnet1.log`` .. ``resnet<n_iter>.log`` from ``log_path`` and uses
    the first decimal number extracted for each of the two keywords.

    Args:
        log_path: directory containing the log files.
        n_iter: number of log files to read.

    Returns:
        dict: maps each log file name to the computed time (the ``summarize``
        decorator then adds its summary entries).

    Raises:
        ValueError: if a log file yields no decimal number for a keyword.
    """
    template = "resnet%d.log"
    # The dot is escaped so only real decimals such as "12.34" match; left
    # unescaped it would also accept text like "2021-03" or "123".
    number = r"\d+\.\d+"
    total = process_log(template, log_path, n_iter, "TotalTime", number)
    task = process_log(template, log_path, n_iter, "task_emit", number)
    log = {}
    for name in total:
        if not total[name] or not task[name]:
            raise ValueError(
                "no TotalTime/task_emit value found in %s" % name)
        log[name] = float(total[name][0]) - float(task[name][0])
    return log
|
||||
|
||||
|
||||
@summarize
def process_gpt_log(log_path, n_iter):
    """Compute, per GPT log file, the ``TotalTime`` minus ``task_emit`` time.

    Reads ``gpt1.log`` .. ``gpt<n_iter>.log`` from ``log_path`` and uses the
    first decimal number extracted for each of the two keywords.

    Args:
        log_path: directory containing the log files.
        n_iter: number of log files to read.

    Returns:
        dict: maps each log file name to the computed time (the ``summarize``
        decorator then adds its summary entries).

    Raises:
        ValueError: if a log file yields no decimal number for a keyword.
    """
    template = "gpt%d.log"
    # The dot is escaped so only real decimals such as "12.34" match; left
    # unescaped it would also accept text like "2021-03" or "123".
    number = r"\d+\.\d+"
    total = process_log(template, log_path, n_iter, "TotalTime", number)
    task = process_log(template, log_path, n_iter, "task_emit", number)
    log = {}
    for name in total:
        if not total[name] or not task[name]:
            raise ValueError(
                "no TotalTime/task_emit value found in %s" % name)
        log[name] = float(total[name][0]) - float(task[name][0])
    return log
|
||||
|
||||
|
||||
@summarize
def process_reid_log(log_path, n_iter):
    """Compute, per ReID log file, the ``TotalTime`` minus ``task_emit`` time.

    For each of the 8 device indices (0..7) the files
    ``reid_<iteration>_<device>.log`` with iteration 1..``n_iter`` are read
    from ``log_path``; the first decimal number extracted for each keyword is
    used.

    Args:
        log_path: directory containing the log files.
        n_iter: number of iterations (log files per device) to read.

    Returns:
        dict: maps each log file name to the computed time (the ``summarize``
        decorator then adds its summary entries).

    Raises:
        ValueError: if a log file yields no decimal number for a keyword.
    """
    # The dot is escaped so only real decimals such as "12.34" match; left
    # unescaped it would also accept text like "2021-03" or "123".
    number = r"\d+\.\d+"
    log = {}
    for device in range(8):
        # "%d" stays in the template: process_log fills in the iteration.
        template = "reid_%d_" + str(device) + ".log"
        total = process_log(template, log_path, n_iter, "TotalTime", number)
        task = process_log(template, log_path, n_iter, "task_emit", number)
        for name in total:
            if not total[name] or not task[name]:
                raise ValueError(
                    "no TotalTime/task_emit value found in %s" % name)
            log[name] = float(total[name][0]) - float(task[name][0])
    return log
|
||||
|
||||
|
||||
def write_to_me_report(log, me_report_path):
    """Append today's minimum times as a new row of the report workbook.

    Args:
        log: dict with "reid", "bert", "resnet" and "gpt" entries, each
            holding a "min_time" value.
        me_report_path: path of the workbook; it is loaded, extended by one
            row in the sheet named "Sheet", and saved back to the same path.
    """
    workbook = opx.load_workbook(me_report_path)
    sheet = workbook["Sheet"]
    row = sheet.max_row + 1
    # Column A receives the local date formatted as month and day.
    sheet['A%d' % row] = time.strftime('%m%d', time.localtime())
    # Columns B..E receive the minimum times rounded to two decimals.
    for column, network in zip("BCDE", ("reid", "bert", "resnet", "gpt")):
        sheet['%s%d' % (column, row)] = round(log[network]["min_time"], 2)
    workbook.save(me_report_path)
|
||||
|
||||
|
||||
def generate_report():
    """Process all network logs, dump the result as JSON, update the report.

    The command-line arguments are read with ``parse_arguments``. The logs are
    processed in the order BERT, ResNet, GPT, ReID and stored under the keys
    "bert", "resnet", "gpt" and "reid". The combined dict is written to the
    ``out`` path as indented JSON and then passed to ``write_to_me_report``.
    """
    me_report_path, log_path, n_iter, out = parse_arguments()
    # A dict display evaluates its values top to bottom, which fixes both the
    # processing order and the key order of the JSON output.
    log_data = {
        "bert": process_bert_log(log_path, n_iter),
        "resnet": process_resnet_log(log_path, n_iter),
        "gpt": process_gpt_log(log_path, n_iter),
        "reid": process_reid_log(log_path, n_iter),
    }
    with open(out, "w") as stream:
        json.dump(log_data, stream, indent=2)
    write_to_me_report(log_data, me_report_path)


# Run the whole pipeline only when executed as a script.
if __name__ == "__main__":
    generate_report()
|
|
@ -0,0 +1,104 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import openpyxl as opx
|
||||
import matplotlib.ticker as ticker
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def parse_arguments():
    """Read the four positional command-line arguments of this script.

    Expected ``sys.argv`` layout:
    ``<log_path> <log_data> <me_report> <n_days>``.

    Returns:
        tuple: ``(log_path, log_data, me_report, n_days)`` where ``n_days``
        has been converted to ``int`` and the other items are raw strings.

    Raises:
        ValueError: if fewer than four arguments were supplied, or if
            ``n_days`` is not made of digits only.
    """
    if len(sys.argv) < 5:
        raise ValueError(
            "expected 4 arguments (log_path log_data me_report n_days), "
            "got %d" % (len(sys.argv) - 1))
    log_path, log_data, me_report, n_days = sys.argv[1:5]
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not n_days.isdigit():
        raise ValueError(
            "n_days must be a non-negative integer, got %r" % n_days)
    return log_path, log_data, me_report, int(n_days)
|
||||
|
||||
|
||||
def read_data(log_data, me_report_path, n_days):
    """Load the JSON log summary and the history columns of the report.

    Args:
        log_data: path of the JSON file to load.
        me_report_path: path of the report workbook; its sheet named "Sheet"
            is read.
        n_days: when greater than zero, only the last ``n_days`` entries of
            every column are kept; otherwise everything is kept.

    Returns:
        tuple: ``(log, date, reid_data, bert_data, resnet_data, gpt_data)``;
        ``log`` is the parsed JSON, ``date`` holds the raw values of column A
        and the four data lists hold columns B..E converted to ``float``.
    """
    with open(log_data) as stream:
        log = json.load(stream)

    sheet = opx.load_workbook(me_report_path)["Sheet"]
    last_row = sheet.max_row

    def column(letter, first_row):
        # Raw values of one column, from ``first_row`` down to ``last_row``.
        cells = sheet["%s%d" % (letter, first_row):"%s%d" % (letter, last_row)]
        return [row[0].value for row in cells]

    date = column("A", 2)
    reid_data = [float(value) for value in column("B", 2)]
    bert_data = [float(value) for value in column("C", 2)]
    resnet_data = [float(value) for value in column("D", 2)]
    # The GPT column is read from row 43 on, unlike the others (row 2).
    # NOTE(review): presumably GPT results only exist from that row - confirm.
    gpt_data = [float(value) for value in column("E", 43)]

    if n_days > 0:
        date, reid_data, bert_data, resnet_data, gpt_data = [
            series[-n_days:]
            for series in (date, reid_data, bert_data, resnet_data, gpt_data)
        ]

    return log, date, reid_data, bert_data, resnet_data, gpt_data
|
||||
|
||||
|
||||
def draw_figure(x_data, y_data, labels, title, out, height=24, width=8, tick_space=2):
    """Plot one line per data series against ``x_data`` and save the figure.

    Args:
        x_data: sequence of x values; a series shorter than ``x_data`` is
            plotted against the *last* ``len(series)`` entries of it.
        y_data: list of series, each a sequence of numbers.
        labels: legend labels, paired with ``y_data`` by position.
        title: figure title.
        out: path the figure is saved to.
        height: first component of the ``figsize`` tuple.
        width: second component of the ``figsize`` tuple.
        tick_space: spacing given to ``ticker.MultipleLocator`` for the x axis.

    Raises:
        ValueError: if a series is longer than ``x_data``.
    """
    print("Generating figure to: %s" % out)
    # NOTE(review): matplotlib documents figsize as (width, height), so these
    # two parameter names look swapped relative to their effect. They are
    # kept as-is for compatibility with existing callers - confirm intent.
    fig = plt.figure(figsize=(height, width))
    try:
        for y, label in zip(y_data, labels):
            start = len(x_data) - len(y)
            if start < 0:
                raise ValueError(
                    "assume len(x) == len(y), while %d != %d"
                    % (len(x_data), len(y)))
            # Slice from an explicit index: ``x_data[-len(y):]`` would return
            # all of ``x_data`` for an empty series.
            x = x_data[start:]
            plt.plot(x, y, linewidth=2, marker='o', markersize=5, label=label)
            ax = plt.gca()
            ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_space))
            # Annotate every second point with its value.
            for i in range(0, len(x), 2):
                plt.text(x[i], y[i], y[i], ha='center',
                         va='bottom', fontsize=8)

        plt.title(title)
        plt.xlabel("Date")
        plt.ylabel("Time(s)")
        plt.grid()
        plt.legend()
        plt.savefig(out)
    finally:
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
|
||||
|
||||
|
||||
def generate_report(log, labels, log_path):
    """Copy the timing section of each label's fastest log into a report file.

    For every ``label`` the log file named by ``log[label]["min_file"]``
    (relative to ``log_path``) is searched with ``grep -A 230`` for the text
    ``TotalTime = `` and the output is written to
    ``<log_path>/reports/<label>_me.log``.

    Args:
        log: dict mapping each label to a dict that has a "min_file" entry.
        labels: iterable of keys of ``log`` to report on.
        log_path: directory holding the log files and the ``reports`` folder.
    """
    import subprocess  # local import: not imported at file level

    for label in labels:
        fname_path = os.path.join(log_path, log[label]["min_file"])
        out_path = os.path.join(log_path, "reports", label + "_me.log")
        print("Generating report to: %s" % out_path)
        # An argument list plus Python-side redirection means no shell parses
        # the paths, so spaces or shell metacharacters in them are harmless.
        with open(out_path, "wb") as out_file:
            # grep's exit status is not checked, as before with os.system.
            subprocess.run(
                ["grep", "-A", "230", "-e", "TotalTime = ", "--", fname_path],
                stdout=out_file, check=False)
|
||||
|
||||
|
||||
def process_data():
    """Draw the trend figures and extract the per-network timing reports.

    The arguments come from ``parse_arguments`` and the data from
    ``read_data``. Two figures are saved under ``<log_path>/reports``:
    ``reid_bert_gpt.png`` (ReID, BERT and GPT together) and ``resnet.png``.
    Finally ``generate_report`` is run for every key of the loaded log.
    """
    log_path, log_data, me_report, n_days = parse_arguments()
    log, date, reid_data, bert_data, resnet_data, gpt_data = read_data(
        log_data, me_report, n_days)
    reports_dir = os.path.join(log_path, "reports")

    combined_series = [reid_data, bert_data, gpt_data]
    combined_labels = ["ReID", "BERT", "GPT"]
    draw_figure(date, combined_series, combined_labels, "ReID&BERT&GPT",
                os.path.join(reports_dir, "reid_bert_gpt.png"))

    # ResNet gets a figure of its own.
    draw_figure(date, [resnet_data], ["ResNet"], "ResNet",
                os.path.join(reports_dir, "resnet.png"))

    generate_report(log, list(log), log_path)


# Run the report generation only when executed as a script.
if __name__ == "__main__":
    process_data()
|
|
@ -0,0 +1,146 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# ---- Default settings; -s/-d/-i/-l override stage/days/iter/log_dir ----
stage=0            # a stage runs when its number is >= this value
days=7             # passed to generate_report.py as its last argument
iter=5             # number of compile iterations per network
device_id=0        # exported below as DEVICE_ID
n_worker=128       # passed to build.sh as -j
work_dir="/opt/npu/me_monitor"
me_report_path="$work_dir/logs/ME_report_daily.xlsx"
log_dir="logs_$(date "+%m%d-%H%M")"
log_path="$work_dir/logs/$log_dir"
ms_master="https://gitee.com/mindspore/mindspore.git"
log_data="data.json"
ci_mode=true

# Abort on the first failing command, including failures inside pipelines.
set -e
set -o pipefail

# parse arguments from command line
while getopts "s:d:i:l:" args
do
    case $args in
        s)
            stage=$OPTARG
            ;;
        d)
            days=$OPTARG
            ;;
        i)
            iter=$OPTARG
            ;;
        l)
            # A custom log directory also moves the derived log path.
            log_dir=$OPTARG
            log_path="$work_dir/logs/$log_dir"
            ;;
        ?)
            echo "unknown argument"
            exit 1
            ;;
    esac
done

# Reject non-numeric values early: they are later used in integer tests
# ([ ... -le ... ]) and in seq, where they would fail with obscure errors.
for opt_name in stage days iter; do
    opt_value=${!opt_name}
    case $opt_value in
        ''|*[!0-9]*)
            echo "option '$opt_name' must be a non-negative integer, got '$opt_value'" >&2
            exit 1
            ;;
    esac
done

# env.sh is resolved relative to the directory the script is started from.
source env.sh
export DEVICE_ID=$device_id
echo "Args: days=$days, iter=$iter, log_path=$log_path"
cd "$work_dir"

# Show the inherited WORKSPACE, then replace it with the fixed Jenkins path.
echo $WORKSPACE
WORKSPACE=/home/jenkins-slave/workspace/MindSpore_Network_reid_compile_performance
echo $WORKSPACE
|
||||
|
||||
# Stage 1: replace any existing checkout with a fresh clone of master.
if [ "$stage" -le 1 ]; then
    echo ""
    echo "===========Stage 1: Fetching latest mindspore from master==========="
    [ ! -d mindspore ] || rm -rf mindspore
    git clone "$ms_master"
fi

# Stage 2: build the checkout with build.sh.
if [ "$stage" -le 2 ]; then
    echo ""
    echo "===========Stage 2: Building mindspore==========="
    cd "$work_dir/mindspore"
    bash build.sh -e ascend -j "$n_worker" -p on
fi
|
||||
|
||||
# Stage 3: run every network $iter times and collect the logs in $log_path.
if [ "$stage" -le 3 ]; then
    echo ""
    echo "===========Stage 3: Compiling networks==========="
    cd "$work_dir"
    mkdir -p "$log_path"

    # Compiling ReID-8
    # split resource-consuming task from others
    for count in $(seq 1 "$iter"); do
        echo "[INFO] Compiling ReID-8p, iteration $count"
        # Start each iteration from an empty scratch directory.
        if [ -d "reid$count" ]; then
            rm -rf "reid$count"
        fi
        mkdir "reid$count"
        cd "reid$count"
        bash "$work_dir/faceReidToMe/dist_env/env_26/dist_env_26.sh"
        # One log per device (0..7), renamed to reid_<iteration>_<device>.log.
        for num in {0..7}; do
            cp "device_$num/test_reid_stage123_1024node_graphdata_dynamiclossscale_log$num.log" "$log_path/reid_${count}_${num}.log"
        done
        cd "$work_dir"
        mv "reid$count" "$log_path"
    done

    # Compiling BERT
    cd "$work_dir"
    for count in $(seq 1 "$iter"); do
        echo "[INFO] Compiling BERT, iteration $count"
        pytest -s mindspore/tests/perf_test/bert/test_bert_train.py::test_bert_train | tee "$log_path/bert$count.log"
    done

    # Compiling ResNet50
    for count in $(seq 1 "$iter"); do
        echo "[INFO] Compiling ResNet50, iteration $count"
        pytest -s mindspore/tests/perf_test/test_resnet_train.py::test_train_step | tee "$log_path/resnet$count.log"
    done

    # Compiling GPT
    for count in $(seq 1 "$iter"); do
        echo "[INFO] Compiling GPT, iteration $count"
        # Absolute path: a relative "cd gpt" only works on the first
        # iteration, because the working directory is never changed back.
        cd "$work_dir/gpt"
        bash scripts/run_standalone_train.sh 0 1 "$work_dir/gpt_data" | tee "$log_path/gpt$count.log"
    done
fi
|
||||
|
||||
# Stage 4: extract the timings from the logs into $log_path/$log_data and
# append them to the report workbook.
if [ "$stage" -le 4 ]; then
    echo ""
    echo "===========Stage 4: Processing log files==========="
    cd "$work_dir"
    python process_data.py "$me_report_path" "$log_path" "$iter" "$log_path/$log_data"
fi

# Stage 5: draw the figures and extract the per-network report files.
if [ "$stage" -le 5 ]; then
    echo ""
    echo "===========Stage 5: Generating reports==========="
    mkdir -p "$log_path/reports"
    python generate_report.py "$log_path" "$log_path/$log_data" "$me_report_path" "$days"

    # Compare against the literal "true": a bare [ $ci_mode ] is true for
    # any non-empty value, including "false".
    if [ "$ci_mode" = true ]; then
        echo "copying file to artifacts"
        mkdir -p "${WORKSPACE}/archive"
        cp "$log_path"/reports/* "${WORKSPACE}/archive"
    fi
fi
|
Loading…
Reference in New Issue