Add ME daily monitor files

This commit is contained in:
huangbingjian 2021-02-09 09:58:19 +08:00
parent 2e9a52fc5c
commit 7e3afac1c0
3 changed files with 393 additions and 0 deletions

View File

@ -0,0 +1,143 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import functools
import json
import os
import re
import sys
import time

import openpyxl as opx
def parse_arguments():
    """Parse command-line arguments for the log-processing script.

    Expected argv: <me_report_path> <log_path> <n_iter> <out>

    Returns:
        tuple: (me_report_path, log_path, n_iter (int), out).

    Raises:
        ValueError: if n_iter is not a non-negative integer literal.
    """
    print(sys.argv)
    me_report_path = sys.argv[1]
    log_path = sys.argv[2]
    n_iter = sys.argv[3]
    out = sys.argv[4]
    # Explicit check instead of `assert`: asserts are stripped under `python -O`.
    if not n_iter.isdigit():
        raise ValueError("n_iter must be a non-negative integer, got %r" % n_iter)
    return me_report_path, log_path, int(n_iter), out
def extract_by_keyword(doc, keyword, pattern):
    """Collect every regex match of `pattern` found on lines of `doc` that
    contain `keyword`.

    Args:
        doc: iterable of lines (e.g. an open file object).
        keyword (str): substring a line must contain to be scanned.
        pattern (str): regular expression whose matches are collected.

    Returns:
        list[str]: all matches, in line order.
    """
    matches = []
    for line_no, line in enumerate(doc):
        if keyword not in line:
            continue
        found = re.findall(pattern, line)
        print("L%d: extracted %s from '%s'" % (line_no, found, line.strip()))
        matches.extend(found)
    return matches
def process_log(fname, log_path, n_iter, keyword, pattern):
    """Scan `n_iter` numbered log files for regex matches.

    Args:
        fname (str): %-style file-name template, e.g. "bert%d.log",
            instantiated with iteration numbers 1..n_iter.
        log_path (str): directory containing the log files.
        n_iter (int): number of iterations (files) to read.
        keyword (str): substring filter passed to extract_by_keyword.
        pattern (str): regex passed to extract_by_keyword.

    Returns:
        dict: {instantiated file name: list of extracted strings}.
    """
    results = {}
    for iteration in range(1, n_iter + 1):
        name = fname % iteration
        full_path = os.path.join(log_path, name)
        with open(full_path) as log_file:
            print("\nLoading %s" % full_path)
            results[name] = extract_by_keyword(log_file, keyword, pattern)
    return results
def summarize(func):
    """Decorator: append min/avg summary statistics to a timing dict.

    The wrapped function must return a dict mapping file name -> time (float).
    The wrapper adds three keys to that dict: "min_time", "min_file" and
    "avg_time".

    Raises:
        ValueError: if the wrapped function returns an empty dict (the
            original code crashed with ZeroDivisionError in that case).
    """
    @functools.wraps(func)  # preserve the wrapped function's name/docstring
    def wrapper(*args, **kwargs):
        log = func(*args, **kwargs)
        if not log:
            raise ValueError("no timing entries to summarize")
        times = list(log.items())
        times.sort(key=lambda item: item[1])
        min_file, min_time = times[0]
        avg = sum(t for _, t in times) / len(times)
        log["min_time"] = min_time
        log["min_file"] = min_file
        log["avg_time"] = avg
        return log
    return wrapper
@summarize
def process_bert_log(log_path, n_iter):
    """Compute BERT compile time per iteration as TotalTime - task_emit.

    Returns a dict {log file name: compile seconds}; @summarize then adds
    min/avg statistics.
    """
    fname = "bert%d.log"
    # Bug fix: escape the dot — r"\d+.\d+" lets "." match any character,
    # so strings like "12x34" were also accepted.
    number = r"\d+\.\d+"
    total = process_log(fname, log_path, n_iter, "TotalTime", number)
    task = process_log(fname, log_path, n_iter, "task_emit", number)
    log = {}
    for name in total:
        # First match of each keyword is the relevant timing value.
        log[name] = float(total[name][0]) - float(task[name][0])
    return log
@summarize
def process_resnet_log(log_path, n_iter):
    """Compute ResNet compile time per iteration as TotalTime - task_emit.

    Returns a dict {log file name: compile seconds}; @summarize then adds
    min/avg statistics.
    """
    fname = "resnet%d.log"
    # Bug fix: escape the dot — r"\d+.\d+" lets "." match any character.
    number = r"\d+\.\d+"
    total = process_log(fname, log_path, n_iter, "TotalTime", number)
    task = process_log(fname, log_path, n_iter, "task_emit", number)
    log = {}
    for name in total:
        # First match of each keyword is the relevant timing value.
        log[name] = float(total[name][0]) - float(task[name][0])
    return log
@summarize
def process_gpt_log(log_path, n_iter):
    """Compute GPT compile time per iteration as TotalTime - task_emit.

    Returns a dict {log file name: compile seconds}; @summarize then adds
    min/avg statistics.
    """
    fname = "gpt%d.log"
    # Bug fix: escape the dot — r"\d+.\d+" lets "." match any character.
    number = r"\d+\.\d+"
    total = process_log(fname, log_path, n_iter, "TotalTime", number)
    task = process_log(fname, log_path, n_iter, "task_emit", number)
    log = {}
    for name in total:
        # First match of each keyword is the relevant timing value.
        log[name] = float(total[name][0]) - float(task[name][0])
    return log
@summarize
def process_reid_log(log_path, n_iter):
    """Compute ReID compile times across 8 devices as TotalTime - task_emit.

    Log files are named "reid_<iteration>_<device>.log" (see the driver
    shell script). Returns {log file name: compile seconds}; @summarize then
    adds min/avg statistics over all iteration/device combinations.
    """
    # Bug fix: escape the dot — r"\d+.\d+" lets "." match any character.
    number = r"\d+\.\d+"
    log = {}
    for device_id in range(8):
        fname = "reid_%d_" + str(device_id) + ".log"
        total = process_log(fname, log_path, n_iter, "TotalTime", number)
        task = process_log(fname, log_path, n_iter, "task_emit", number)
        for name in total:
            log[name] = float(total[name][0]) - float(task[name][0])
    return log
def write_to_me_report(log, me_report_path):
    """Append today's minimum compile times as a new row of the ME report.

    Args:
        log (dict): {"reid"/"bert"/"resnet"/"gpt": {"min_time": float, ...}}.
        me_report_path (str): path to the .xlsx workbook (sheet "Sheet").
    """
    workbook = opx.load_workbook(me_report_path)
    sheet = workbook["Sheet"]
    row = sheet.max_row + 1
    # Column layout: A=date (MMDD), B=ReID, C=BERT, D=ResNet, E=GPT.
    sheet["A%d" % row] = time.strftime('%m%d', time.localtime())
    for column, model in zip("BCDE", ("reid", "bert", "resnet", "gpt")):
        sheet["%s%d" % (column, row)] = round(log[model]["min_time"], 2)
    workbook.save(me_report_path)
def generate_report():
    """Entry point: parse argv, summarize all network logs, dump them as JSON
    and append the daily row to the ME report workbook."""
    me_report_path, log_path, n_iter, out = parse_arguments()
    # Dict literals evaluate in order, so the networks are processed in the
    # same sequence as before: bert, resnet, gpt, reid.
    log_data = {
        "bert": process_bert_log(log_path, n_iter),
        "resnet": process_resnet_log(log_path, n_iter),
        "gpt": process_gpt_log(log_path, n_iter),
        "reid": process_reid_log(log_path, n_iter),
    }
    with open(out, "w") as f:
        json.dump(log_data, f, indent=2)
    write_to_me_report(log_data, me_report_path)


if __name__ == "__main__":
    generate_report()

View File

@ -0,0 +1,104 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import os
import sys
import json
import openpyxl as opx
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
def parse_arguments():
    """Parse command-line arguments for the report-generation script.

    Expected argv: <log_path> <log_data> <me_report> <n_days>

    Returns:
        tuple: (log_path, log_data, me_report, n_days (int)).

    Raises:
        ValueError: if n_days is not a non-negative integer literal.
    """
    log_path = sys.argv[1]
    log_data = sys.argv[2]
    me_report = sys.argv[3]
    n_days = sys.argv[4]
    # Explicit check instead of `assert`: asserts are stripped under `python -O`.
    if not n_days.isdigit():
        raise ValueError("n_days must be a non-negative integer, got %r" % n_days)
    return log_path, log_data, me_report, int(n_days)
def read_data(log_data, me_report_path, n_days):
    """Load the JSON log summary and the per-network time series from the
    ME report workbook.

    Args:
        log_data (str): path to the JSON summary produced by process_data.py.
        me_report_path (str): path to the ME report .xlsx workbook.
        n_days (int): keep only the most recent n_days samples (0 = keep all).

    Returns:
        tuple: (log dict, dates, reid, bert, resnet, gpt data lists).
    """
    with open(log_data) as f:
        log = json.load(f)
    wb = opx.load_workbook(me_report_path)
    sheet = wb["Sheet"]
    n_row = sheet.max_row
    # Row 1 is assumed to be the header; data rows start at row 2.
    date = [cell[0].value for cell in sheet["A2":"A%d" % n_row]]
    reid_data = [float(cell[0].value) for cell in sheet["B2":"B%d" % n_row]]
    bert_data = [float(cell[0].value) for cell in sheet["C2":"C%d" % n_row]]
    resnet_data = [float(cell[0].value) for cell in sheet["D2":"D%d" % n_row]]
    # NOTE(review): the GPT series starts at row 43, not 2 — presumably GPT
    # tracking began later so earlier cells are empty; confirm the offset.
    gpt_data = [float(cell[0].value) for cell in sheet["E43":"E%d" % n_row]]
    if n_days > 0:
        # Trim every series to the most recent n_days samples.
        date = date[-n_days:]
        reid_data = reid_data[-n_days:]
        bert_data = bert_data[-n_days:]
        resnet_data = resnet_data[-n_days:]
        gpt_data = gpt_data[-n_days:]
    return log, date, reid_data, bert_data, resnet_data, gpt_data
def draw_figure(x_data, y_data, labels, title, out, height=24, width=8, tick_space=2):
    """Plot one line per series in `y_data` against (a suffix of) `x_data`
    and save the figure to `out`.

    Args:
        x_data (list): shared x-axis values (dates).
        y_data (list[list]): one series per label; a shorter series is
            aligned to the tail of x_data.
        labels (list[str]): legend label per series.
        title (str): figure title.
        out (str): output image path.
        height, width (int): figure size; passed to matplotlib as
            figsize=(height, width).
        tick_space (int): spacing between major x-axis ticks.
    """
    print("Generating figure to: %s" % out)
    # NOTE(review): matplotlib's figsize is (width, height), so the parameter
    # names here are swapped — this call yields a 24x8-inch figure. The output
    # is unchanged by this observation; confirm the naming intent.
    plt.figure(figsize=(height, width))
    for y, label in zip(y_data, labels):
        # Align a shorter series with the most recent dates.
        x = x_data[-len(y):]
        n_data = len(x)
        assert len(x) == len(
            y), "assume len(x) == len(y), while %d != %d" % (len(x), len(y))
        plt.plot(x, y, linewidth=2, marker='o', markersize=5, label=label)
        ax = plt.gca()
        ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_space))
        # Annotate every other data point to keep the plot readable.
        for i in range(n_data):
            if i % 2 == 0:
                plt.text(x[i], y[i], y[i], ha='center',
                         va='bottom', fontsize=8)
    plt.title(title)
    plt.xlabel("Date")
    plt.ylabel("Time(s)")
    plt.grid()
    plt.legend()
    plt.savefig(out)
def generate_report(log, labels, log_path):
    """Extract the compile-time section of each network's fastest log into
    the reports/ subdirectory.

    Args:
        log (dict): {label: {"min_file": str, ...}} summary produced by
            process_data.py.
        labels (list[str]): networks to generate reports for.
        log_path (str): directory containing the raw logs and reports/.
    """
    import subprocess  # local import keeps the file's import block unchanged
    for label in labels:
        fname = log[label]["min_file"]
        fname_path = os.path.join(log_path, fname)
        out_path = os.path.join(log_path, "reports", label + "_me.log")
        print("Generating report to: %s" % out_path)
        # Use an argument list instead of a shell-interpolated string so that
        # paths with spaces or shell metacharacters cannot break the command.
        # grep exits 1 when nothing matches, so don't raise on nonzero status
        # (os.system ignored the return code too).
        with open(out_path, "w") as out_file:
            subprocess.run(["grep", "-A", "230", "TotalTime = ", fname_path],
                           stdout=out_file, check=False)
def process_data():
    """Entry point: read the data, draw both trend figures and dump the
    per-network compile-time reports."""
    log_path, log_data, me_report, n_days = parse_arguments()
    log, date, reid, bert, resnet, gpt = read_data(log_data, me_report, n_days)
    reports_dir = os.path.join(log_path, "reports")
    # Combined figure first, then the ResNet-only figure (same order as before).
    draw_figure(date,
                [reid, bert, gpt],
                ["ReID", "BERT", "GPT"],
                "ReID&BERT&GPT",
                os.path.join(reports_dir, "reid_bert_gpt.png"))
    draw_figure(date, [resnet], ["ResNet"], "ResNet",
                os.path.join(reports_dir, "resnet.png"))
    generate_report(log, list(log.keys()), log_path)


if __name__ == "__main__":
    process_data()

View File

@ -0,0 +1,146 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# ----- default configuration (overridable via the command-line flags below) -----
stage=0        # first stage to run; stages with a smaller index are skipped
days=7         # number of most recent days to plot
iter=5         # compile iterations per network
device_id=0    # NPU device to export as DEVICE_ID
n_worker=128   # parallel build jobs passed to build.sh
work_dir="/opt/npu/me_monitor"
me_report_path=$work_dir/logs/ME_report_daily.xlsx
log_dir=logs_$(date "+%m%d-%H%M")   # timestamped per-run log directory name
log_path=$work_dir/logs/$log_dir
ms_master="https://gitee.com/mindspore/mindspore.git"
log_data="data.json"                # JSON summary file name inside log_path
ci_mode=true
# Abort on the first failing command, including failures inside pipelines.
set -e
set -o pipefail
# Parse command-line flags: -s stage, -d days, -i iterations, -l log dir name.
while getopts "s:d:i:l:" opt; do
    case $opt in
        s) stage=$OPTARG ;;
        d) days=$OPTARG ;;
        i) iter=$OPTARG ;;
        l)
            # A custom log directory also moves the full log path.
            log_dir=$OPTARG
            log_path=$work_dir/logs/$log_dir
            ;;
        ?)
            echo "unknown argument"
            exit 1
            ;;
    esac
done
# Load the build/runtime environment and select the target NPU device.
source env.sh
export DEVICE_ID=$device_id
echo "Args: days=$days, iter=$iter, log_path=$log_path"
cd $work_dir
# NOTE(review): WORKSPACE is echoed, then unconditionally overwritten with a
# hard-coded Jenkins path, then echoed again — the externally provided value
# can never be used. Confirm whether the inherited WORKSPACE should win.
echo $WORKSPACE
WORKSPACE=/home/jenkins-slave/workspace/MindSpore_Network_reid_compile_performance
echo $WORKSPACE
if [ $stage -le 1 ]; then
    echo ""
    echo "===========Stage 1: Fetching latest mindspore from master==========="
    # rm -rf is a no-op for a missing directory, so no existence guard needed.
    rm -rf mindspore
    # Quote the URL variable so word splitting can never mangle the argument.
    git clone "$ms_master"
fi
if [ $stage -le 2 ]; then
    echo ""
    echo "===========Stage 2: Building mindspore==========="
    # Quote the path so a work_dir containing spaces does not word-split;
    # `set -e` aborts the script if the cd fails.
    cd "$work_dir/mindspore"
    bash build.sh -e ascend -j $n_worker -p on
fi
if [ $stage -le 3 ]; then
    echo ""
    echo "===========Stage 3: Compiling networks==========="
    cd $work_dir
    mkdir -p $log_path
    # Compiling ReID-8
    # split resource-consuming task from others
    for count in $(seq 1 $iter); do
        echo "[INFO] Compiling ReID-8p, iteration $count"
        # Start each iteration from a clean working directory.
        if [ -d reid$count ]; then
            rm -rf reid$count
        fi
        mkdir reid$count
        cd reid$count
        bash $work_dir/faceReidToMe/dist_env/env_26/dist_env_26.sh
        # Collect the per-device logs (8 devices) into the shared log directory.
        for num in {0..7}; do
            cp device_$num/test_reid_stage123_1024node_graphdata_dynamiclossscale_log$num.log $log_path/reid_${count}_${num}.log
        done
        cd $work_dir
        mv reid$count $log_path
    done
    # Compiling BERT
    cd $work_dir
    for count in $(seq 1 $iter); do
        echo "[INFO] Compiling BERT, iteration $count"
        pytest -s mindspore/tests/perf_test/bert/test_bert_train.py::test_bert_train | tee $log_path/bert$count.log
    done
    # Compiling ResNet50
    for count in $(seq 1 $iter); do
        echo "[INFO] Compiling ResNet50, iteration $count"
        pytest -s mindspore/tests/perf_test/test_resnet_train.py::test_train_step | tee $log_path/resnet$count.log
    done
    # Compiling GPT
    for count in $(seq 1 $iter); do
        echo "[INFO] Compiling GPT, iteration $count"
        # Bug fix: the original did `cd gpt` on every iteration without ever
        # returning, so the second iteration tried `cd gpt` from inside gpt/
        # and aborted the script (set -e). Run in a subshell so the working
        # directory is restored after each iteration.
        (cd gpt && bash scripts/run_standalone_train.sh 0 1 $work_dir/gpt_data | tee $log_path/gpt$count.log)
    done
fi
if [ $stage -le 4 ]; then
    echo ""
    echo "===========Stage 4: Processing log files==========="
    cd "$work_dir"
    # Quote every path argument so directories with spaces survive splitting.
    python process_data.py "$me_report_path" "$log_path" "$iter" "$log_path/$log_data"
fi
if [ $stage -le 5 ]; then
    echo ""
    echo "===========Stage 5: Generating reports==========="
    # mkdir -p is a no-op when the directory already exists.
    mkdir -p "$log_path/reports"
    python generate_report.py "$log_path" "$log_path/$log_data" "$me_report_path" "$days"
    # Bug fix: `[ $ci_mode ]` is true for ANY non-empty value, including the
    # string "false" — compare against the literal instead.
    if [ "$ci_mode" = true ]; then
        echo "copying file to artifacts"
        mkdir -p "${WORKSPACE}/archive"
        cp "$log_path"/reports/* "${WORKSPACE}/archive"
    fi
fi