#!/bin/bash
#
# job-summary.sh - print a summary of a training job (duration, GPU count,
# steps, accuracy and throughput) computed from the job log and from the
# description of the job's launcher pod.
#
# Run with -h to see the supported options.

# Make sure command tracing is switched off for the whole script.
set +x
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   usage, option list and examples on stdout
#######################################
print_usage()
{
    # NOTE: the option is really spelled "--laucher" (that is what the
    # argument parser accepts), so the spelling is kept in the option name.
    printf 'Usage:\n'
    printf ' ./job-summary.sh [options].\n'
    printf 'OPTIONS:\n'
    printf ' -h, --help \t Display this help message.\n'
    printf ' --job \t Set the name of job.\n'
    printf ' --log \t Set the path of logs file.\n'
    printf ' --laucher \t Set the path of job launcher description file.\n'
    printf 'EXAMPLE:\n'
    printf ' ./job-summary.sh --job <JOB_NAME>\n'
    printf ' ./job-summary.sh --log <PATH_TO_JOB_LOG> --laucher <PATH_TO_LAUCHER_DESCRIPTION>\n'
}
#######################################
# Count the log lines reported for step 1 (lines starting with "1",
# one whitespace character, then "images/sec").
# NOTE(review): presumably every GPU/worker logs one such line, which is
# why the count is used as the number of GPUs - confirm against the log.
# Globals:   log_path (read)
# Arguments: none
# Outputs:   the number of matching lines (0 if none or log unreadable)
#######################################
training_GPUs()
{
    local count

    # grep -c prints nothing when the file cannot be read; fall back to 0
    # so the caller always receives a number.
    count=$(grep -E -c -e '^1[[:space:]]images/sec' -- "${log_path}")
    echo "${count:-0}"
}
#######################################
# Report the step number of the last "images/sec" line found in the
# final 1000 lines of the log.
# Only lines whose first field is a plain number are considered, so a
# trailing non-step line that also mentions "images/sec" cannot be
# mistaken for a step.
# Globals:   log_path (read)
# Arguments: none
# Outputs:   the step number, or an empty line when none is found
#######################################
training_steps()
{
    tail -n 1000 -- "${log_path}" \
        | awk '/images\/sec/ && $1 ~ /^[0-9]+$/ { last = $1 } END { print last }'
}
#######################################
# Report the 10th whitespace-separated field of the last line containing
# "Accuracy @ 5" within the final 1000 lines of the log.
# NOTE(review): assumes the line looks like
# "Accuracy @ 1 = X Accuracy @ 5 = Y ...", which makes field 10 the
# top-5 accuracy - confirm against the log format.
# Globals:   log_path (read)
# Arguments: none
# Outputs:   the accuracy value, or an empty line when none is found
#######################################
training_accuracy()
{
    tail -n 1000 -- "${log_path}" \
        | awk '/Accuracy @ 5/ { acc = $10 } END { print acc }'
}
#######################################
# Sum and average the speed (third field) of every log line that belongs
# to one step, i.e. whose first field equals the step and whose second
# field starts with "images/sec".
# Globals:   log_path (read)
# Arguments: $1 - step number to look up
# Outputs:   "<total speed> <average speed>"; "0 0" when the step is absent
#######################################
training_speed_at_step()
{
    local step=${1}

    # The step is compared as a string so that e.g. 100 never matches 1000.
    # The average is an exact division guarded against an empty match
    # instead of dividing by "count + epsilon", which skewed the result.
    awk -v step="${step}" '
        ($1 "") == (step "") && $2 ~ /^images\/sec/ { cnt++; total += $3 }
        END { print total + 0, (cnt ? total / cnt : 0) }
    ' "${log_path}"
}
#######################################
# Average the speed (third field) of all step lines up to and including
# a given step. A step line contains "images/sec" and has a plain number
# as its first field.
# Globals:   log_path (read), gpus (read)
# Arguments: $1 - last step to include
# Outputs:   "<avg speed * gpus> <avg speed per line> <number of lines>"
#######################################
training_average_speed_until_steps()
{
    local steps="${1}"

    # Exact division guarded against "no lines" replaces the former
    # "n + epsilon" divisor; "+ 0" forces a numeric comparison/print.
    awk -v gpus="${gpus}" -v steps="${steps}" '
        /images\/sec/ && $1 ~ /^[0-9]+$/ && ($1 + 0) <= (steps + 0) {
            n++
            total += $3
        }
        END {
            avg_gpu = (n ? total / n : 0)
            avg_step = (n ? total * gpus / n : 0)
            print avg_step, avg_gpu, n + 0
        }
    ' "${log_path}"
}
#######################################
# Average the speed (third field) of all step lines whose step lies in
# the closed range [lower, upper]. A step line contains "images/sec" and
# has a plain number as its first field.
# Both bounds are inclusive, so adjacent ranges share their boundary step.
# Globals:   log_path (read), gpus (read)
# Arguments: $1 - first step to include
#            $2 - last step to include
# Outputs:   "<avg speed * gpus> <avg speed per line> <number of lines>"
#######################################
training_average_speed_from_to_steps()
{
    local lower="${1}"
    local upper="${2}"

    # Exact division guarded against "no lines" replaces the former
    # "n + epsilon" divisor; "+ 0" forces numeric comparisons.
    awk -v gpus="${gpus}" -v lower="${lower}" -v upper="${upper}" '
        /images\/sec/ && $1 ~ /^[0-9]+$/ \
            && ($1 + 0) >= (lower + 0) && ($1 + 0) <= (upper + 0) {
            n++
            total += $3
        }
        END {
            avg_gpu = (n ? total / n : 0)
            avg_step = (n ? total * gpus / n : 0)
            print avg_step, avg_gpu, n + 0
        }
    ' "${log_path}"
}
#######################################
# Average the speed (third field) of every step line in the log. A step
# line contains "images/sec" and has a plain number as its first field,
# so other lines that merely mention "images/sec" are not counted.
# Globals:   log_path (read), gpus (read)
# Arguments: none
# Outputs:   "<avg speed * gpus> <avg speed per line> <number of lines>"
#######################################
training_average_speed_of_all_steps()
{
    # Exact division guarded against "no lines" replaces the former
    # "n + epsilon" divisor.
    awk -v gpus="${gpus}" '
        /images\/sec/ && $1 ~ /^[0-9]+$/ {
            n++
            total += $3
        }
        END {
            avg_gpu = (n ? total / n : 0)
            avg_step = (n ? total * gpus / n : 0)
            print avg_step, avg_gpu, n + 0
        }
    ' "${log_path}"
}
#######################################
# Print the average per-line speed over the first 100, 500, 1000 and
# 2000 steps of a job, sampling every 10th step (0, 10, ..., 2000).
# For each sampled step the speeds (third field) of its lines are
# averaged; these per-step averages are accumulated and, at each
# checkpoint i, divided by i / 10 samples.
# The log is streamed from "arena logs" straight into one awk pass, so no
# temporary file is left behind and precision is only dropped when the
# final value is printed.
# Globals:   job_name (read)
# Arguments: none
# Outputs:   four lines "Top <i>:<TAB><avg> images/sec"
#######################################
training_average_speed()
{
    arena logs "${job_name}" | awk '
        /^[0-9]+[[:space:]]images\/sec/ && $1 % 10 == 0 && $1 <= 2000 {
            cnt[$1 + 0]++
            total[$1 + 0] += $3
        }
        END {
            sum = 0
            for (i = 0; i <= 2000; i += 10) {
                if ((i in cnt) && cnt[i] > 0) {
                    sum += total[i] / cnt[i]
                }
                if (i == 100 || i == 500 || i == 1000 || i == 2000) {
                    printf "Top %d:\t%d images/sec\n", i, int(sum * 10 / i)
                }
            }
        }
    '
}
#######################################
# Extract a timestamp from a saved pod description and convert it to
# seconds since the epoch.
# The first line matching the requested keyword within the 20 lines that
# follow a line starting with "Containers" is used; its fields 2-7 are
# handed to "date -d". Only the first match is converted so the result is
# always a single number.
# Arguments: $1 - path of the pod description file
#            $2 - keyword to look for (the callers use "Started"/"Finished")
# Outputs:   epoch seconds, or an empty line when nothing matches
# Returns:   0 on success, 1 when no timestamp is found, otherwise the
#            exit status of date
#######################################
get_pod_timestamp()
{
    local pod_path=$1
    local status=$2
    local stamp

    stamp=$(grep -E -A 20 -e '^Containers' -- "${pod_path}" \
        | grep -e "${status}" \
        | awk 'NR == 1 { print $2, $3, $4, $5, $6, $7 }')

    # Nothing but separators means the keyword was not present.
    if [[ -z "${stamp// /}" ]]; then
        echo ""
        return 1
    fi

    date "+%s" -d "${stamp}"
}
#######################################
# Compute how long a pod ran, as the difference between its "Finished"
# and "Started" timestamps.
# Arguments: $1 - path of the pod description file
# Outputs:   lifetime in seconds; 0 (plus a warning on stderr) when
#            either timestamp is missing or not a number
# Returns:   0 on success, 1 when the lifetime cannot be computed
#######################################
compute_pod_lifetime()
{
    local pod_path=$1
    local started
    local finished

    started=$(get_pod_timestamp "${pod_path}" "Started")
    finished=$(get_pod_timestamp "${pod_path}" "Finished")

    # Without both timestamps the subtraction would either be a syntax
    # error or yield a meaningless huge negative number.
    if [[ ! "${started}" =~ ^[0-9]+$ || ! "${finished}" =~ ^[0-9]+$ ]]; then
        echo "WARNING: cannot find start/finish time in '${pod_path}'" >&2
        echo 0
        return 1
    fi

    echo $((finished - started))
}
#######################################
# Format the lifetime of the launcher pod as a human readable duration:
# "<s>s" below one minute, "<m>m<s>s" below one hour and "<h>h<m>m<s>s"
# otherwise.
# Globals:   laucher_path (read) - path of the launcher pod description
# Arguments: none
# Outputs:   the duration; "0s" (plus a warning on stderr) when the
#            lifetime is not a non-negative integer
#######################################
training_time()
{
    local seconds
    local duration="0s"

    seconds=$(compute_pod_lifetime "${laucher_path}")

    if [[ ! "${seconds}" =~ ^[0-9]+$ ]]; then
        echo "WARNING: unknown job duration" >&2
    else
        # Force base 10 so a value with leading zeros is not read as octal.
        seconds=$((10#${seconds}))
        if (( seconds < 60 )); then
            duration="${seconds}s"
        elif (( seconds < 3600 )); then
            duration="$((seconds / 60))m$((seconds % 60))s"
        else
            duration="$((seconds / 3600))h$((seconds % 3600 / 60))m$((seconds % 60))s"
        fi
    fi

    echo "${duration}"
}
#######################################
# Print the job summary: name, duration, GPU count, steps, accuracy, the
# speed at step 1000 and at the last step, and average speeds for the
# first 25/50/75/100% of the steps and for each quarter.
# The 25/50/75% marks are multiples of (total_steps / 10 / 4 * 10), i.e.
# a quarter of the steps rounded down to a multiple of 10.
# Globals:   job_name (read); log_path is read by the helpers;
#            gpus is set here and read by the averaging helpers
# Arguments: none
# Outputs:   the summary on stdout, warnings on stderr
#######################################
summary()
{
    # "gpus" is deliberately named like this: the averaging helpers read it
    # from the calling scope.
    local gpus
    local total_steps
    local startup_step=1000
    local quarter step_25 step_50 step_75
    local avg_speed_25 avg_speed_50 avg_speed_75 avg_speed_100
    local avg_speed_25_to_50 avg_speed_50_to_75 avg_speed_75_to_100

    gpus=$(training_GPUs)
    total_steps=$(training_steps)

    # An empty or non-numeric step count would break the arithmetic below.
    if [[ "${total_steps}" =~ ^[0-9]+$ ]]; then
        total_steps=$((10#${total_steps}))
    else
        echo "WARNING: no training step found in the log" >&2
        total_steps=0
    fi

    quarter=$((total_steps / 10 / 4 * 10))
    step_25=${quarter}
    step_50=$((quarter * 2))
    step_75=$((quarter * 3))

    # Cumulative averages.
    avg_speed_25=$(training_average_speed_until_steps "${step_25}")
    avg_speed_50=$(training_average_speed_until_steps "${step_50}")
    avg_speed_75=$(training_average_speed_until_steps "${step_75}")
    avg_speed_100=$(training_average_speed_of_all_steps)

    # Averages of the individual quarters.
    avg_speed_25_to_50=$(training_average_speed_from_to_steps "${step_25}" "${step_50}")
    avg_speed_50_to_75=$(training_average_speed_from_to_steps "${step_50}" "${step_75}")
    avg_speed_75_to_100=$(training_average_speed_from_to_steps "${step_75}" "${total_steps}")

    printf '==============SUMMARY==================\n'
    printf 'Name: \t\t %s\n' "${job_name}"
    printf 'Duration: \t\t %s\n' "$(training_time)"
    printf 'GPUs: \t\t %s\n' "${gpus}"
    printf 'Steps: \t\t %s\n' "${total_steps}"
    printf 'Accuracy@5: \t\t %s\n' "$(training_accuracy)"
    printf 'Speed of \t\t\t Step \t GPU \t cnt\n'
    printf 'Speed@%s: \t\t\t %s\n' "${startup_step}" "$(training_speed_at_step "${startup_step}")"
    printf 'Speed@%s: \t\t\t %s\n' "${total_steps}" "$(training_speed_at_step "${total_steps}")"
    printf 'Average Speed 25%%(%s): \t %s\n' "${step_25}" "${avg_speed_25}"
    printf 'Average Speed 50%%(%s): \t %s\n' "${step_50}" "${avg_speed_50}"
    printf 'Average Speed 75%%(%s): \t %s\n' "${step_75}" "${avg_speed_75}"
    printf 'Average Speed 100%%(%s): \t %s\n' "${total_steps}" "${avg_speed_100}"
    printf 'Average Speed 0%% to 25%%: \t %s\n' "${avg_speed_25}"
    printf 'Average Speed 25%% to 50%%: \t %s\n' "${avg_speed_25_to_50}"
    printf 'Average Speed 50%% to 75%%: \t %s\n' "${avg_speed_50_to_75}"
    printf 'Average Speed 75%% to 100%%: \t %s\n' "${avg_speed_75_to_100}"
    printf '================END====================\n'
}
#######################################
# Fill in the inputs that were not given on the command line:
#  - log_path:     output (stdout and stderr) of "arena logs <job>"
#  - laucher_path: output of "kubectl describe po <pod>", where <pod> is
#                  the 5th field of the first "arena get <job>" line that
#                  contains "launcher"
# The files are created with mktemp under /tmp so that their names are
# not predictable and concurrent runs cannot clash; they are kept after
# the script ends.
# Globals:   job_name (read), log_path (read/written),
#            laucher_path (read/written)
# Arguments: none
# Outputs:   error messages on stderr
# Returns:   0 on success, 1 when a missing input cannot be fetched
#######################################
ensure_params()
{
    local stamp
    local job_laucher

    stamp=$(date '+%Y%m%d%H%M')

    if [[ -z "${log_path}" ]]; then
        if [[ -z "${job_name}" ]]; then
            echo "ERROR: a job name is required to fetch the job log" >&2
            return 1
        fi
        log_path=$(mktemp "/tmp/${job_name}_${stamp}.log.XXXXXX") || return 1
        arena logs "${job_name}" >"${log_path}" 2>&1
    fi

    if [[ -z "${laucher_path}" ]]; then
        if [[ -z "${job_name}" ]]; then
            echo "ERROR: a job name is required to find the launcher pod" >&2
            return 1
        fi
        job_laucher=$(arena get "${job_name}" \
            | awk '/launcher/ && !seen++ { print $5 }')
        # An empty pod name would make kubectl describe every pod.
        if [[ -z "${job_laucher}" ]]; then
            echo "ERROR: cannot find the launcher pod of job ${job_name}" >&2
            return 1
        fi
        laucher_path=$(mktemp "/tmp/${job_laucher}_${stamp}.log.XXXXXX") || return 1
        kubectl describe po "${job_laucher}" >"${laucher_path}" 2>&1
    fi
}
#######################################
# Parse the command line, fetch whatever input is missing and print the
# job summary.
# Accepted options: -h/--help, --job <name>, --log <path>,
# --laucher <path>. Either --job, or both --log and --laucher, must be
# given: without a job name nothing that is missing can be fetched.
# Globals:   job_name, log_path, laucher_path (written)
# Arguments: the script's command-line arguments
# Outputs:   the summary on stdout; errors and usage on failure
# Returns:   exits 0 after -h/--help, 1 on invalid arguments
#######################################
main()
{
    local args

    # Parse arguments using getopt; it reports unknown options itself.
    args=$(getopt -a -o h --long help,job:,log:,laucher: -- "$@") || exit 1

    # getopt returns a shell-quoted string; eval is the documented way to
    # load it back into the positional parameters.
    eval set -- "${args}"

    while true; do
        case "$1" in
            -h|--help)
                print_usage
                exit 0
                ;;
            --job)
                job_name=$2
                shift 2
                ;;
            --log)
                log_path=$2
                shift 2
                ;;
            --laucher)
                laucher_path=$2
                shift 2
                ;;
            --)
                shift
                break
                ;;
            *)
                echo "ERROR: invalid argument $1" >&2
                exit 1
                ;;
        esac
    done

    if [[ -z "${job_name}" && ( -z "${log_path}" || -z "${laucher_path}" ) ]]; then
        echo "ERROR: invalid arguments: give --job, or both --log and --laucher" >&2
        print_usage
        exit 1
    fi

    ensure_params
    summary
}
# Entry point: hand all command-line arguments to main.
main "$@"