"""
Usage:
$ python scripts/gen_benchmark.py
"""
import glob
import os
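
# post-processing options; `rm_columns` is only consumed by the commented-out
# pandas step near the end of this script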
clear_empty_urls = True
rm_columns = [] # ['Infer T.', 'Log']
# get all training recipes
recipes = sorted(glob.glob("configs/*/*.yaml"))
print("Total number of training recipes: ", len(recipes))
ar = glob.glob("configs/*/*_ascend.yaml")
print("Ascend training recipes: ", len(ar))
gr = glob.glob("configs/*/*_gpu.yaml")
print("GPU training recipes: ", len(gr))
for item in set(recipes) - set(ar) - set(gr):
    print(item)
models_with_train_rec = []
for r in recipes:
    mn = r.split("/")[-2]
    if mn not in models_with_train_rec:
        models_with_train_rec.append(mn)
models_with_train_rec = sorted(models_with_train_rec)
print("\n==> Models with training recipes: ", len(models_with_train_rec))
print(models_with_train_rec)
# get readme file list
config_dirs = sorted([d for d in os.listdir("./configs") if os.path.isdir("configs/" + d)])
print("\nTotal number of config folders: ", len(config_dirs))
print("==> Configs w/o training rec: ", set(config_dirs) - set(models_with_train_rec))
readmes = [f"configs/{d}/README.md" for d in config_dirs]
for readme in readmes:
    if not os.path.exists(readme):
        print("Missing readme: ", readme)
# check yaml and reported performance
# merge readme reported results
print("\r\n ")
output_path = "./benchmark_results.md"
fout = open(output_path, "w")
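# NOTE: this keyword list is not read anywhere below; the README parser reuses
# the name `kw` as its loop variable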
kw = ["Model", "Top", "Download", "Config"]
# process table head
head = (
    "| Model | Context | Top-1 (%) | Top-5 (%) | Params(M) "
    "| Recipe "
    "| Download |"
)
fout.write(head + "\n")
fout.write(
    "| -------------- | -------- | --------- | --------- | --------- "
    "| ------------------------------------------------------------------------------------------------------- "
    "| ---------------------------------------------------------------------------- |\n"
)
attrs = head.replace(" ", "")[1:-1].split("|")
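# -> ['Model', 'Context', 'Top-1(%)', 'Top-5(%)', 'Params(M)', 'Recipe', 'Download']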
print("table attrs: ", attrs)
result_kw = ["Results", "Benchmark", "Result"] # TODO: unify this name
head_detect_kw = ["Model", "Top"]
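# NOTE: head_detect_kw is currently unused; the parser below looks for the literal
# "|Model|Context" prefix when detecting the table head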
# process each model readme
parsed_models = []
parsed_model_specs = []
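
# Parse each README with a small state machine:
#   state 0 - looking for a results section heading ("## Results", "## Benchmark", ...)
#   state 1 - heading found, looking for a table head that matches `head`
#   state 2 - table head found, copying data rows into the benchmark file
#   state 3 - table finished for this README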
for r in readmes:
    state = 0
    print("parsing ", r)
    results = []
    with open(r) as fp:
        for line in fp:
            if state == 0:
                for kw in result_kw:
                    if f"##{kw}" in line.strip().replace(" ", ""):
                        state = 1
            # detect head
            elif state == 1:
                if "|Model|Context" in line.replace(" ", ""):
                    if len(line.split("|")) == len(head.split("|")):
                        state = 2
                    else:
                        print("Detected a table head, but its format is incorrect:")
                        # print(line)
            # get table values
            elif state == 2:
                if len(line.split("|")) == len(head.split("|")):
                    # skip the separator row and rows containing "--" placeholders
                    if "--" not in line:
                        results.append(line)
                        # print(line)
                        fout.write(line)
                        parsed_model_specs.append(line.split("|")[1])  # first table column: model spec name
                else:
                    parsed_models.append(r.split("/")[-2])
                    state = 3

    if state == 0:
        print("Failed to find a results section")
    elif state == 1:
        print("Failed to find the table head")
    elif state == 2:
        print("Failed to read the table values")
print("Parsed models in benchmark: ", len(parsed_models))
print("Parsed model specs in benchmark: ", len(parsed_model_specs))
print("Readme using inconsistent result table format: \r\n", set(config_dirs) - set(parsed_models))
"""
fout.close()
def md_to_pd(md_fp, md_has_col_name=True, save_csv=False):
# Convert the Markdown table to a list of lists
with open(md_fp) as f:
rows = []
for row in f.readlines():
if len(row.split('|')) >= 2:
# Get rid of leading and trailing '|'
tmp = row[1:-2]
# Split line and ignore column whitespace
clean_line = [col.strip() for col in tmp.split('|')]
# Append clean row data to rows variable
rows.append(clean_line)
# Get rid of syntactical sugar to indicate header (2nd row)
rows = rows[:1] + rows[2:]
print(rows)
if md_has_col_name:
df = pd.DataFrame(data=rows[1:], columns=rows[0])
else:
df = pd.DataFrame(rows)
if save_csv:
df.to_csv(md_fp.replace('.md', '.csv'), index=False, header=False)
return df
df = md_to_pd(output_path, save_csv=True)
print(df)
for cn in rm_columns:
df = df.drop(cn, axis=1)
print(df)
md_doc = df.to_markdown(mode='w', index=False, tablefmt='pipe')
fout = open(output_path, 'w')
fout.write(md_doc)
"""
# write notes
fout.write("\n#### Notes\n")
fout.write(
    "- Context: Training context denoted as {device}x{pieces}-{MS mode}, "
    "where the MindSpore mode can be G - graph mode or F - pynative mode with ms function. "
    "For example, D910x8-G is for training on 8 Ascend 910 NPUs using graph mode.\n"
    "- Top-1 and Top-5: Accuracy reported on the validation set of ImageNet-1K."
)
fout.close()
print(f"\n ===> Done! Benchmark generated in {output_path}")