add me daily monitor files
This commit is contained in:
parent
2e9a52fc5c
commit
7e3afac1c0
|
@ -0,0 +1,143 @@
|
||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import openpyxl as opx
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """Read the four positional command-line arguments of this script.

    The arguments are, in order: the path of the Excel report, the directory
    holding the log files, the number of iterations, and the output path.

    Returns:
        tuple: ``(me_report_path, log_path, n_iter, out)``; ``n_iter`` is
        converted to ``int``, the other values are the raw strings.

    Raises:
        ValueError: if fewer than four arguments are given or the iteration
            count is not a non-negative decimal integer.
    """
    print(sys.argv)
    if len(sys.argv) < 5:
        raise ValueError(
            "expected 4 arguments (me_report_path log_path n_iter out), got %d"
            % (len(sys.argv) - 1))
    me_report_path, log_path, n_iter, out = sys.argv[1:5]
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not n_iter.isdigit():
        raise ValueError("n_iter must be a non-negative integer, got %r" % n_iter)
    return me_report_path, log_path, int(n_iter), out
|
||||||
|
|
||||||
|
|
||||||
|
def extract_by_keyword(doc, keyword, pattern):
    """Collect regex matches from every line of ``doc`` containing ``keyword``.

    Args:
        doc: iterable of text lines (for example an open file).
        keyword: substring a line must contain to be inspected.
        pattern: regular expression handed to ``re.findall`` for such lines.

    Returns:
        list: all matches, in the order the lines were read.
    """
    matches = []
    for line_no, line in enumerate(doc):
        if keyword not in line:
            continue
        found = re.findall(pattern, line)
        # Trace every hit so the extracted values can be checked in the console output.
        print("L%d: extracted %s from '%s'" % (line_no, found, line.strip()))
        matches += found
    return matches
|
||||||
|
|
||||||
|
|
||||||
|
def process_log(fname, log_path, n_iter, keyword, pattern):
    """Run ``extract_by_keyword`` over the numbered log files of one network.

    Args:
        fname: file-name template with one ``%d`` placeholder for the
            iteration number.
        log_path: directory containing the log files.
        n_iter: number of iterations; files 1..n_iter are read.
        keyword: substring selecting the lines to inspect.
        pattern: regular expression applied to the selected lines.

    Returns:
        dict: log file name (without directory) -> list of extracted matches.
    """
    results = {}
    for iteration in range(1, n_iter + 1):
        log_name = fname % iteration
        full_path = os.path.join(log_path, log_name)
        with open(full_path) as handle:
            print("\nLoading %s" % full_path)
            results[log_name] = extract_by_keyword(handle, keyword, pattern)
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def summarize(func):
    """Decorator adding summary statistics to a ``{log name: time}`` dict.

    The wrapped function must return a non-empty dict mapping names to
    numeric times. The wrapper adds three keys to that same dict:

    * ``"min_time"``: the smallest time,
    * ``"min_file"``: the name belonging to that time,
    * ``"avg_time"``: the arithmetic mean of all times.

    Raises:
        ValueError: (from the wrapper) if the wrapped function returns an
            empty result, since no minimum or average exists.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        log = func(*args, **kwargs)
        if not log:
            raise ValueError("%s returned no timing data to summarize"
                             % getattr(func, "__name__", "wrapped function"))
        # min() yields the first minimal entry, matching index 0 of a stable sort by value.
        min_file, min_time = min(log.items(), key=lambda item: item[1])
        # Compute the average before the summary keys are inserted into the dict.
        avg = sum(log.values()) / len(log)
        log["min_time"] = min_time
        log["min_file"] = min_file
        log["avg_time"] = avg
        return log
    return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
@summarize
def process_bert_log(log_path, n_iter):
    """Compute ``TotalTime - task_emit`` for every BERT log file.

    Reads ``bert1.log`` .. ``bert<n_iter>.log`` in ``log_path``. For each file
    the first decimal number found on a ``task_emit`` line is subtracted from
    the first decimal number found on a ``TotalTime`` line.

    Args:
        log_path: directory containing the log files.
        n_iter: number of log files to read.

    Returns:
        dict: log file name -> time difference (float), before the
        ``summarize`` decorator post-processes it.

    Raises:
        ValueError: if a log file yields no number for either keyword.
    """
    # The dot is escaped so only real decimals match; a bare `.` matches any
    # character and would also accept text such as "12 34".
    number = r"\d+\.\d+"
    name_pattern = "bert%d.log"
    total = process_log(name_pattern, log_path, n_iter, "TotalTime", number)
    task = process_log(name_pattern, log_path, n_iter, "task_emit", number)
    log = {}
    for log_name, total_times in total.items():
        task_times = task[log_name]
        if not total_times or not task_times:
            raise ValueError("%s: no TotalTime or task_emit value found" % log_name)
        log[log_name] = float(total_times[0]) - float(task_times[0])
    return log
|
||||||
|
|
||||||
|
|
||||||
|
@summarize
def process_resnet_log(log_path, n_iter):
    """Compute ``TotalTime - task_emit`` for every ResNet log file.

    Reads ``resnet1.log`` .. ``resnet<n_iter>.log`` in ``log_path``. For each
    file the first decimal number found on a ``task_emit`` line is subtracted
    from the first decimal number found on a ``TotalTime`` line.

    Args:
        log_path: directory containing the log files.
        n_iter: number of log files to read.

    Returns:
        dict: log file name -> time difference (float), before the
        ``summarize`` decorator post-processes it.

    Raises:
        ValueError: if a log file yields no number for either keyword.
    """
    # The dot is escaped so only real decimals match; a bare `.` matches any
    # character and would also accept text such as "12 34".
    number = r"\d+\.\d+"
    name_pattern = "resnet%d.log"
    total = process_log(name_pattern, log_path, n_iter, "TotalTime", number)
    task = process_log(name_pattern, log_path, n_iter, "task_emit", number)
    log = {}
    for log_name, total_times in total.items():
        task_times = task[log_name]
        if not total_times or not task_times:
            raise ValueError("%s: no TotalTime or task_emit value found" % log_name)
        log[log_name] = float(total_times[0]) - float(task_times[0])
    return log
|
||||||
|
|
||||||
|
|
||||||
|
@summarize
def process_gpt_log(log_path, n_iter):
    """Compute ``TotalTime - task_emit`` for every GPT log file.

    Reads ``gpt1.log`` .. ``gpt<n_iter>.log`` in ``log_path``. For each file
    the first decimal number found on a ``task_emit`` line is subtracted from
    the first decimal number found on a ``TotalTime`` line.

    Args:
        log_path: directory containing the log files.
        n_iter: number of log files to read.

    Returns:
        dict: log file name -> time difference (float), before the
        ``summarize`` decorator post-processes it.

    Raises:
        ValueError: if a log file yields no number for either keyword.
    """
    # The dot is escaped so only real decimals match; a bare `.` matches any
    # character and would also accept text such as "12 34".
    number = r"\d+\.\d+"
    name_pattern = "gpt%d.log"
    total = process_log(name_pattern, log_path, n_iter, "TotalTime", number)
    task = process_log(name_pattern, log_path, n_iter, "task_emit", number)
    log = {}
    for log_name, total_times in total.items():
        task_times = task[log_name]
        if not total_times or not task_times:
            raise ValueError("%s: no TotalTime or task_emit value found" % log_name)
        log[log_name] = float(total_times[0]) - float(task_times[0])
    return log
|
||||||
|
|
||||||
|
|
||||||
|
@summarize
def process_reid_log(log_path, n_iter, n_devices=8):
    """Compute ``TotalTime - task_emit`` for every ReID log file.

    Log files are named ``reid_<iteration>_<device>.log``; all iterations
    1..n_iter of all devices 0..n_devices-1 are read from ``log_path``. For
    each file the first decimal number found on a ``task_emit`` line is
    subtracted from the first decimal number found on a ``TotalTime`` line.

    Args:
        log_path: directory containing the log files.
        n_iter: number of iterations per device.
        n_devices: number of devices whose logs are read (default 8, the
            value that was previously hard-coded).

    Returns:
        dict: log file name -> time difference (float), before the
        ``summarize`` decorator post-processes it.

    Raises:
        ValueError: if a log file yields no number for either keyword.
    """
    # The dot is escaped so only real decimals match; a bare `.` matches any
    # character and would also accept text such as "12 34".
    number = r"\d+\.\d+"
    log = {}
    for device in range(n_devices):
        # The remaining %d is filled with the iteration number by process_log.
        name_pattern = "reid_%d_" + str(device) + ".log"
        total = process_log(name_pattern, log_path, n_iter, "TotalTime", number)
        task = process_log(name_pattern, log_path, n_iter, "task_emit", number)
        for log_name, total_times in total.items():
            task_times = task[log_name]
            if not total_times or not task_times:
                raise ValueError("%s: no TotalTime or task_emit value found" % log_name)
            log[log_name] = float(total_times[0]) - float(task_times[0])
    return log
|
||||||
|
|
||||||
|
|
||||||
|
def write_to_me_report(log, me_report_path):
    """Append one row with today's minimum times to the Excel report.

    Column A receives the current local date formatted as ``MMDD``; columns
    B..E receive the ``"min_time"`` of reid, bert, resnet and gpt, rounded to
    two decimals. The workbook is saved back to ``me_report_path``.

    Args:
        log: dict with the keys ``"reid"``, ``"bert"``, ``"resnet"`` and
            ``"gpt"``, each mapping to a dict holding ``"min_time"``.
        me_report_path: path of the workbook; it must contain a sheet named
            ``"Sheet"``.
    """
    workbook = opx.load_workbook(me_report_path)
    worksheet = workbook["Sheet"]
    row = worksheet.max_row + 1
    worksheet['A%d' % row] = time.strftime('%m%d', time.localtime())
    # Cell template -> network whose minimum time goes into that column.
    columns = (
        ('B%d', "reid"),
        ('C%d', "bert"),
        ('D%d', 'resnet'),
        ('E%d', 'gpt'),
    )
    for cell_template, network in columns:
        worksheet[cell_template % row] = round(log[network]["min_time"], 2)
    workbook.save(me_report_path)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_report():
    """Process all network logs, dump them as JSON and update the Excel report.

    The command-line arguments are read through ``parse_arguments``. The
    summaries of the four networks are written to the ``out`` file as indented
    JSON and then passed to ``write_to_me_report``.
    """
    me_report_path, log_path, n_iter, out = parse_arguments()
    # Dict displays evaluate in source order, so the logs are processed (and
    # later serialised) in the order bert, resnet, gpt, reid.
    log_data = {
        "bert": process_bert_log(log_path, n_iter),
        "resnet": process_resnet_log(log_path, n_iter),
        "gpt": process_gpt_log(log_path, n_iter),
        "reid": process_reid_log(log_path, n_iter),
    }
    with open(out, "w") as f:
        json.dump(log_data, f, indent=2)
    write_to_me_report(log_data, me_report_path)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the log-processing pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    generate_report()
|
|
@ -0,0 +1,104 @@
|
||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import openpyxl as opx
|
||||||
|
import matplotlib.ticker as ticker
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
|
||||||
|
def parse_arguments():
    """Read the four positional command-line arguments of this script.

    The arguments are, in order: the log directory, the path of the JSON log
    data, the path of the Excel report, and the number of days to plot.

    Returns:
        tuple: ``(log_path, log_data, me_report, n_days)``; ``n_days`` is
        converted to ``int``, the other values are the raw strings.

    Raises:
        ValueError: if fewer than four arguments are given or the day count
            is not a non-negative decimal integer.
    """
    if len(sys.argv) < 5:
        raise ValueError(
            "expected 4 arguments (log_path log_data me_report n_days), got %d"
            % (len(sys.argv) - 1))
    log_path, log_data, me_report, n_days = sys.argv[1:5]
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not n_days.isdigit():
        raise ValueError("n_days must be a non-negative integer, got %r" % n_days)
    return log_path, log_data, me_report, int(n_days)
|
||||||
|
|
||||||
|
|
||||||
|
def read_data(log_data, me_report_path, n_days, gpt_first_row=43):
    """Load the JSON log summary and the time series stored in the Excel report.

    Columns A..E of the sheet named ``"Sheet"`` are read as date, ReID, BERT,
    ResNet and GPT values. Row 1 is skipped for columns A..D (presumably a
    header row; verify against the workbook).

    Args:
        log_data: path of the JSON file to load.
        me_report_path: path of the Excel workbook.
        n_days: if greater than 0, only the last ``n_days`` entries of every
            series are kept; otherwise the series are returned in full.
        gpt_first_row: first sheet row of the GPT column (default 43, the
            value that was previously hard-coded).
            NOTE(review): presumably GPT measurements were added to the sheet
            later than the other networks; confirm.

    Returns:
        tuple: ``(log, date, reid_data, bert_data, resnet_data, gpt_data)``.
        ``log`` is the parsed JSON; ``date`` holds the raw cell values; the
        remaining lists hold floats. ``gpt_data`` may be shorter than the
        other series.
    """
    with open(log_data) as f:
        log = json.load(f)

    wb = opx.load_workbook(me_report_path)
    sheet = wb["Sheet"]
    n_row = sheet.max_row

    def column_values(letter, first_row=2):
        """Return the raw values of one column from first_row to the last row."""
        first = "%s%d" % (letter, first_row)
        last = "%s%d" % (letter, n_row)
        return [cell[0].value for cell in sheet[first:last]]

    date = column_values("A")
    reid_data = [float(value) for value in column_values("B")]
    bert_data = [float(value) for value in column_values("C")]
    resnet_data = [float(value) for value in column_values("D")]
    gpt_data = [float(value) for value in column_values("E", gpt_first_row)]
    if n_days > 0:
        date = date[-n_days:]
        reid_data = reid_data[-n_days:]
        bert_data = bert_data[-n_days:]
        resnet_data = resnet_data[-n_days:]
        gpt_data = gpt_data[-n_days:]

    return log, date, reid_data, bert_data, resnet_data, gpt_data
|
||||||
|
|
||||||
|
|
||||||
|
def draw_figure(x_data, y_data, labels, title, out, height=24, width=8, tick_space=2):
    """Plot one or more series as line charts and save the figure to ``out``.

    Every series is plotted against the last ``len(series)`` entries of
    ``x_data``, so series shorter than ``x_data`` are right-aligned. Every
    second data point is annotated with its value.

    Args:
        x_data: sequence of x values shared by all series.
        y_data: iterable of series (sequences of numbers).
        labels: legend labels, paired with ``y_data`` by position.
        title: figure title.
        out: path the figure is saved to.
        height: first element of ``figsize``. NOTE: matplotlib interprets the
            first ``figsize`` element as the width in inches, so despite its
            name this value controls the horizontal size.
        width: second element of ``figsize`` (the vertical size in inches).
        tick_space: spacing of the major x-axis ticks.

    Raises:
        ValueError: if a series is longer than ``x_data``.
    """
    print("Generating figure to: %s" % out)
    fig = plt.figure(figsize=(height, width))
    try:
        for y, label in zip(y_data, labels):
            # x_data[-0:] is the whole sequence, so an empty series needs its own case.
            x = x_data[-len(y):] if len(y) else x_data[:0]
            if len(x) != len(y):
                raise ValueError(
                    "assume len(x) == len(y), while %d != %d" % (len(x), len(y)))
            plt.plot(x, y, linewidth=2, marker='o', markersize=5, label=label)
            ax = plt.gca()
            ax.xaxis.set_major_locator(ticker.MultipleLocator(tick_space))
            # Label only every second point to keep the annotations readable.
            for i in range(0, len(x), 2):
                plt.text(x[i], y[i], y[i], ha='center',
                         va='bottom', fontsize=8)

        plt.title(title)
        plt.xlabel("Date")
        plt.ylabel("Time(s)")
        plt.grid()
        plt.legend()
        plt.savefig(out)
    finally:
        # Release the figure; pyplot otherwise keeps every figure alive across calls.
        plt.close(fig)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_report(log, labels, log_path):
    """Extract the ``TotalTime`` section of each network's fastest log file.

    For every label, the file named by ``log[label]["min_file"]`` inside
    ``log_path`` is searched with ``grep -A 230 'TotalTime = '`` and the
    output is written to ``<log_path>/reports/<label>_me.log``.

    Args:
        log: dict mapping each label to a dict containing ``"min_file"``.
        labels: iterable of keys of ``log`` to process.
        log_path: directory holding the log files; the ``reports``
            sub-directory is created if it does not exist.
    """
    import subprocess  # local import keeps this block self-contained

    report_dir = os.path.join(log_path, "reports")
    os.makedirs(report_dir, exist_ok=True)
    for label in labels:
        fname = log[label]["min_file"]
        fname_path = os.path.join(log_path, fname)
        out_path = os.path.join(report_dir, label+"_me.log")
        print("Generating report to: %s" % out_path)
        with open(out_path, "wb") as report:
            # An argument list (no shell) keeps paths containing spaces or shell
            # metacharacters intact. grep's exit status is deliberately not checked
            # (it is 1 when nothing matches), as with the previous os.system call.
            subprocess.run(
                ["grep", "-A", "230", "-e", "TotalTime = ", "--", fname_path],
                stdout=report, check=False)
|
||||||
|
|
||||||
|
|
||||||
|
def process_data():
    """Draw the trend figures and extract the per-network report files.

    Reads the command-line arguments, loads the data through ``read_data``,
    saves one combined figure for ReID/BERT/GPT and one for ResNet into
    ``<log_path>/reports``, then calls ``generate_report`` for every key of
    the loaded log data.
    """
    log_path, log_data, me_report, n_days = parse_arguments()
    loaded = read_data(log_data, me_report, n_days)
    log, date, reid_data, bert_data, resnet_data, gpt_data = loaded

    combined_png = os.path.join(log_path, "reports", "reid_bert_gpt.png")
    resnet_png = os.path.join(log_path, "reports", "resnet.png")

    # ResNet gets its own figure; the other three networks share one.
    draw_figure(date, [reid_data, bert_data, gpt_data],
                ["ReID", "BERT", "GPT"], "ReID&BERT&GPT", combined_png)
    draw_figure(date, [resnet_data], ["ResNet"], "ResNet", resnet_png)
    generate_report(log, list(log), log_path)
|
||||||
|
|
||||||
|
|
||||||
|
# Generate figures and reports only when executed as a script, not on import.
if __name__ == "__main__":
    process_data()
|
|
@ -0,0 +1,146 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# Default settings. stage, days, iter and log_dir can be overridden with the
# -s, -d, -i and -l command-line options respectively.
stage=0          # stage N runs only when $stage <= N, so lower-numbered stages are skipped
days=7           # passed to generate_report.py as its last argument
iter=5           # number of compile runs per network
device_id=0      # exported as DEVICE_ID
n_worker=128     # passed to build.sh as the -j value
work_dir="/opt/npu/me_monitor"
me_report_path=$work_dir/logs/ME_report_daily.xlsx
log_dir=logs_$(date "+%m%d-%H%M")   # timestamped directory name for this run
log_path=$work_dir/logs/$log_dir
ms_master="https://gitee.com/mindspore/mindspore.git"
log_data="data.json"                # file name of the JSON summary inside $log_path
ci_mode=true                        # checked in stage 5 before copying reports to the archive

# Abort on the first failing command, including failures inside a pipeline.
set -e
set -o pipefail
|
||||||
|
|
||||||
|
# parse arguments from command line
#   -s STAGE    first stage to run
#   -d DAYS     number of days passed to generate_report.py
#   -i ITER     number of compile runs per network
#   -l LOG_DIR  log directory name below $work_dir/logs
while getopts "s:d:i:l:" args
do
    case $args in
        s)
            stage=$OPTARG
            ;;
        d)
            days=$OPTARG
            ;;
        i)
            iter=$OPTARG
            ;;
        l)
            log_dir=$OPTARG
            log_path=$work_dir/logs/$log_dir
            ;;
        ?)
            # Diagnostics go to stderr so they do not mix with regular output.
            echo "unknown argument" >&2
            echo "usage: $0 [-s stage] [-d days] [-i iter] [-l log_dir]" >&2
            exit 1
            ;;
    esac
done

# A non-numeric value would make every `[ $stage -le N ]` test fail, which
# silently skips all stages; reject such values up front instead.
for opt_value in "$stage" "$days" "$iter"; do
    case $opt_value in
        ''|*[!0-9]*)
            echo "stage, days and iter must be non-negative integers" >&2
            exit 1
            ;;
    esac
done
|
||||||
|
|
||||||
|
# env.sh is sourced before the cd below, i.e. it is not looked up in $work_dir
# unless the script is started from there.
source env.sh
export DEVICE_ID=$device_id
echo "Args: days=$days, iter=$iter, log_path=$log_path"
cd $work_dir

# NOTE(review): WORKSPACE is printed and then unconditionally replaced by a
# hard-coded path, so any value inherited from the caller is discarded.
# Confirm that this is intended.
echo $WORKSPACE
WORKSPACE=/home/jenkins-slave/workspace/MindSpore_Network_reid_compile_performance
echo $WORKSPACE
|
||||||
|
|
||||||
|
# Stage 1: replace any existing checkout with a fresh clone of $ms_master.
# The clone lands in the current directory ($work_dir after the cd above).
if [ $stage -le 1 ]; then
    echo ""
    echo "===========Stage 1: Fetching latest mindspore from master==========="
    if [ -d mindspore ]; then
        rm -rf mindspore
    fi
    git clone $ms_master
fi
|
||||||
|
|
||||||
|
# Stage 2: build the checkout with build.sh, using $n_worker as the -j value.
# The working directory stays inside the checkout afterwards; later stages cd
# back to $work_dir themselves.
if [ $stage -le 2 ]; then
    echo ""
    echo "===========Stage 2: Building mindspore==========="
    cd $work_dir/mindspore
    bash build.sh -e ascend -j $n_worker -p on
fi
|
||||||
|
|
||||||
|
# Stage 3: run every network $iter times and collect the logs in $log_path.
if [ $stage -le 3 ]; then
    echo ""
    echo "===========Stage 3: Compiling networks==========="
    cd "$work_dir"
    mkdir -p "$log_path"

    # Compiling ReID-8
    # split resource-consuming task from others
    for count in $(seq 1 $iter); do
        echo "[INFO] Compiling ReID-8p, iteration $count"
        if [ -d reid$count ]; then
            rm -rf reid$count
        fi
        mkdir reid$count
        cd reid$count
        bash "$work_dir/faceReidToMe/dist_env/env_26/dist_env_26.sh"
        # Copy the log of each of the 8 devices to reid_<iteration>_<device>.log.
        for num in {0..7}; do
            cp device_$num/test_reid_stage123_1024node_graphdata_dynamiclossscale_log$num.log "$log_path/reid_${count}_${num}.log"
        done
        cd "$work_dir"
        mv reid$count "$log_path"
    done

    # Compiling BERT
    cd "$work_dir"
    for count in $(seq 1 $iter); do
        echo "[INFO] Compiling BERT, iteration $count"
        pytest -s mindspore/tests/perf_test/bert/test_bert_train.py::test_bert_train | tee "$log_path/bert$count.log"
    done

    # Compiling ResNet50
    for count in $(seq 1 $iter); do
        echo "[INFO] Compiling ResNet50, iteration $count"
        pytest -s mindspore/tests/perf_test/test_resnet_train.py::test_train_step | tee "$log_path/resnet$count.log"
    done

    # Compiling GPT
    for count in $(seq 1 $iter); do
        echo "[INFO] Compiling GPT, iteration $count"
        # Use the absolute path: a relative `cd gpt` only works on the first
        # iteration; from the second one on the shell is already inside gpt/,
        # the cd fails and `set -e` aborts the script.
        cd "$work_dir/gpt"
        bash scripts/run_standalone_train.sh 0 1 "$work_dir/gpt_data" | tee "$log_path/gpt$count.log"
    done
fi
|
||||||
|
|
||||||
|
# Stage 4: extract the timings from the logs into $log_path/$log_data and
# append them to the Excel report.
if [ $stage -le 4 ]; then
    echo ""
    echo "===========Stage 4: Processing log files==========="
    cd "$work_dir"
    # Quoted so that a log directory containing whitespace (set with -l) is
    # still passed as a single argument.
    python process_data.py "$me_report_path" "$log_path" "$iter" "$log_path/$log_data"
fi
|
||||||
|
|
||||||
|
# Stage 5: draw the figures, extract the per-network reports and, in CI mode,
# copy them to the archive directory below $WORKSPACE.
if [ $stage -le 5 ]; then
    echo ""
    echo "===========Stage 5: Generating reports==========="
    if [ ! -d "$log_path/reports" ]; then
        mkdir "$log_path/reports"
    fi
    python generate_report.py "$log_path" "$log_path/$log_data" "$me_report_path" "$days"

    # `[ $ci_mode ]` only tests for a non-empty string, so ci_mode=false would
    # still copy the files; compare against the literal value instead.
    if [ "$ci_mode" = true ]; then
        echo "copying file to artifacts"
        mkdir -p "${WORKSPACE}/archive"
        cp "$log_path"/reports/* "${WORKSPACE}/archive"
    fi
fi
|
Loading…
Reference in New Issue