normalize_and_filter_text.py 1.95 KB
Newer Older
Nikhilesh Bhatnagar's avatar
Nikhilesh Bhatnagar committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import fasttext as ft
import os
import regex
import sys


def get_parser():
    parser = argparse.ArgumentParser(
        description="reads text from stdin and outputs normalized, lid-filtered version to stdout"
    )
    parser.add_argument(
        "--fasttext-model",
        help="path to fasttext model",
        default="lid.187.bin",
    )
    parser.add_argument("--lang", help="language id", required=True)
    parser.add_argument(
        "--lid-threshold",
        type=float,
        help="threshold for this lang id probability",
        default=0.4,
    )

    return parser


def main():
    parser = get_parser()
    args = parser.parse_args()
    filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]")

    lg = args.lang.lower()
    lg_label = f"__label__{lg}"
    thresh = args.lid_threshold

    if os.path.exists(args.fasttext_model):
        model = ft.load_model(args.fasttext_model)
    else:
        print(
            f"fasttext language id model {args.fasttext_model} not found. Proceeding without language filtering. "
            f"To enable language filtering, please download the latest language id model "
            f"from https://fasttext.cc/docs/en/language-identification.html",
            file=sys.stderr,
        )
        model = None

    for line in sys.stdin:
        line = line.strip()
        line = filter_r.sub(" ", line)
        line = " ".join(line.split())

        if model is not None:
            lid, prob = model.predict(line, k=100)
            try:
                target_idx = lid.index(lg_label)
            except ValueError:
                continue
            if target_idx == 0 or prob[target_idx] >= thresh:
                print(line)
        else:
            print(line)


if __name__ == "__main__":
    main()