#!/usr/bin/env python
"""Align all .wav files in JW* directories for which a transcript exists.

For every ``./JW*/tp*.wav`` that does not already have a ``.TextGrid`` next
to it, a transcript is chosen (a ``.txt`` file beside the wav, otherwise the
generic ``./transcripts/tp<id>.txt``) and the external ``make_text_grids``
command is run on it.  The path of every TextGrid whose alignment command
succeeded is appended to ``todo_<timestamp>.txt`` in the current directory.
"""

import os
import sys
from glob import glob
import subprocess
from datetime import datetime
import re

# Parses names like "074_2": captures the three characters before "_".
ID_PATTERN = re.compile(r'(...)_.')


def make_timestamp():
    """Return the local time as an ISO-8601 string with the colons removed.

    Colons are stripped so the value can be embedded in a file name.
    """
    # Imported here so the third-party dependency is only required when the
    # script actually runs, not when the helpers are imported.
    from dateutil.tz import tzlocal

    now = datetime.now(tzlocal()).replace(microsecond=0)
    return now.isoformat().replace(":", "")


def sibling_path(wav, extension):
    """Return the path of *wav* with its extension replaced by *extension*.

    Only the final extension is touched, so a ".wav" occurring in a
    directory name is left alone.
    """
    return os.path.splitext(wav)[0] + extension


def strip_tp(stem):
    """Remove the leading "tp" from a file stem, if present."""
    return stem[2:] if stem.startswith('tp') else stem


def wav_idnum(wav):
    """Return the id used to look up the generic transcript for *wav*.

    The id is the file stem without the leading "tp".  When it looks like
    "074_2" (see ID_PATTERN) only the first part, "074", is kept.
    """
    stem = os.path.splitext(os.path.basename(wav))[0]
    idnum = strip_tp(stem)
    match = ID_PATTERN.search(idnum)
    if match:
        idnum = match.group(1)
    return idnum


def map_transcripts(transcript_dir='./transcripts'):
    """Map id -> path for every generic transcript ``tp<id>.txt``.

    Only "tp"-prefixed files are collected; wav ids are always derived from
    "tp"-prefixed names, so other files could never be looked up.
    """
    transcripts = {}
    for path in glob(os.path.join(transcript_dir, 'tp*.txt')):
        stem = os.path.splitext(os.path.basename(path))[0]
        transcripts[strip_tp(stem)] = path
    return transcripts


def choose_transcript(wav, transcripts):
    """Return the transcript to use for *wav*, or None if there is none.

    A ``.txt`` file next to the wav wins; otherwise the generic transcript
    registered in *transcripts* under the wav's id is used.
    """
    own = sibling_path(wav, '.txt')
    if os.path.isfile(own):
        return own
    return transcripts.get(wav_idnum(wav))


def align(wav, transcript, tg, aligner='make_text_grids'):
    """Run the aligner command; return True on success, False on failure.

    The command is invoked as ``aligner wav transcript tg`` (presumably it
    writes the TextGrid to *tg* -- not verifiable from this script).  A
    non-zero exit status or invalid arguments are reported on stderr so that
    one bad file does not abort the whole batch.  A missing aligner
    executable (OSError) is deliberately not caught: every file would fail.
    """
    try:
        subprocess.check_call([aligner, wav, transcript, tg])
    except (subprocess.CalledProcessError, ValueError) as e:
        sys.stderr.write(
            'Alignment failed for wav {} and transcript {}: {}\n'.format(
                wav, transcript, e
            )
        )
        return False
    return True


def main():
    """Align every pending wav and record the produced TextGrid paths."""
    todo_name = 'todo_' + make_timestamp() + '.txt'
    transcripts = map_transcripts()
    with open(todo_name, 'w') as todo:
        for jwdir in glob('./JW*'):
            for wav in glob(os.path.join(jwdir, 'tp*.wav')):
                tg = sibling_path(wav, '.TextGrid')
                print(tg)
                if os.path.isfile(tg):
                    continue  # already aligned
                transcript = choose_transcript(wav, transcripts)
                if transcript is None:
                    continue  # no transcript file
                if align(wav, transcript, tg):
                    todo.write(tg + '\n')


if __name__ == '__main__':
    main()