# prepare_data_from_w2v.py
# Committed by Nikhilesh Bhatnagar.
import kaldi_io
import numpy as np
import os


def get_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` accepting three positional arguments
        (``w2v_dir``, ``tar_root``, ``split``) and one optional ``--label``
        argument whose default is the empty string.
    """
    from argparse import ArgumentParser

    # (name, help text) for each required positional, in command-line order.
    positionals = (
        ("w2v_dir", "wav2vec feature and text directory"),
        ("tar_root", "output data directory in kaldi's format"),
        ("split", "name of the subset"),
    )

    cli = ArgumentParser()
    for arg_name, description in positionals:
        cli.add_argument(arg_name, help=description)
    cli.add_argument("--label", help="if specified, copy labels too", default="")
    return cli

def main():
    """Convert the dumped features of one split into a Kaldi-style data directory.

    Inputs, read from ``<w2v_dir>``:

    * ``<split>.lengths`` -- one integer per line, the number of rows that
      belong to each utterance;
    * ``<split>.npy`` -- the rows of all utterances stacked along axis 0
      (memory-mapped, so it is not loaded in full);
    * ``<split>.<label>`` -- only when ``--label`` is given; line ``i`` is
      the label text of utterance ``i``.

    Outputs, written to ``<tar_root>/<split>`` (created if missing):

    * ``feats.ark`` / ``feats.scp`` -- produced by piping the matrices through
      the external ``copy-feats --compress=true`` command;
    * ``utt2spk`` / ``spk2utt`` -- every utterance id is mapped to itself;
    * ``text`` -- only when ``--label`` is given.

    Utterance ids are ``utt`` followed by the zero-padded, 10-digit index of
    the utterance in the lengths file.

    Raises:
        ValueError: if the sum of the lengths differs from the number of rows
            in the feature matrix.
    """
    import shlex

    parser = get_parser()
    args = parser.parse_args()

    tar_dir = os.path.join(args.tar_root, args.split)
    os.makedirs(tar_dir, exist_ok=True)

    lengths_path = os.path.join(args.w2v_dir, f"{args.split}.lengths")
    with open(lengths_path) as f:
        lengths = [int(line.rstrip()) for line in f]
    # Each utterance starts where the previous ones end: running total of
    # all preceding lengths, with the first utterance at row 0.
    offsets = [0] + np.cumsum(lengths[:-1]).tolist()

    feats = np.load(
        os.path.join(args.w2v_dir, f"{args.split}.npy"),
        mmap_mode="r"
    )
    # Explicit check instead of `assert`: assertions are removed under
    # `python -O`, which would let mismatched inputs be sliced silently.
    total_rows = sum(lengths)
    if feats.shape[0] != total_rows:
        raise ValueError(f"lengths mismatch {feats.shape[0]} != {total_rows}")

    # Single source of truth for the ids used by every output file.
    utt_ids = [f"utt{idx:010d}" for idx in range(len(lengths))]

    ark_path = os.path.join(tar_dir, "feats.ark")
    scp_path = os.path.join(tar_dir, "feats.scp")
    # The command is run through a shell, so quote the output specifier to
    # keep paths with spaces or shell metacharacters as one argument.
    # Ordinary paths are left unchanged by shlex.quote.
    out_wspec = shlex.quote(f"ark,scp:{ark_path},{scp_path}")
    wspec = f"ark:| copy-feats --compress=true ark:- {out_wspec}"
    with kaldi_io.open_or_fd(wspec, "wb") as f:
        for utt_id, offset, length in zip(utt_ids, offsets, lengths):
            feat = feats[offset:offset+length]
            kaldi_io.write_mat(f, feat, key=utt_id)

    u2s_path = os.path.join(tar_dir, "utt2spk")
    s2u_path = os.path.join(tar_dir, "spk2utt")
    with open(u2s_path, "w") as f_u2s, open(s2u_path, "w") as f_s2u:
        # Each utterance is its own speaker, so both maps have identical lines.
        for utt_id in utt_ids:
            f_u2s.write(f"{utt_id} {utt_id}\n")
            f_s2u.write(f"{utt_id} {utt_id}\n")

    if args.label:
        lab_path = os.path.join(args.w2v_dir, f"{args.split}.{args.label}")
        txt_path = os.path.join(tar_dir, "text")
        with open(lab_path) as f_lab, open(txt_path, "w") as f_txt:
            for idx, line in enumerate(f_lab):
                # Re-terminate every entry so a label file that lacks a final
                # newline still yields a complete last line in `text`.
                label = line.rstrip("\n")
                f_txt.write(f"utt{idx:010d} {label}\n")

# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()