From 81ffe5a50f2aaa3cf0d7add4be5efcddfdde4e89 Mon Sep 17 00:00:00 2001
From: Benno Lang
Date: Tue, 21 Jan 2025 22:59:56 +1030
Subject: [PATCH] First version, seems to work

---
 .gitignore     |   3 +
 lib/args.py    |  96 +++++++++++++++++++++++
 lib/config.py  |  31 ++++++++
 lib/fetcher.py |  33 ++++++++
 lib/files.py   | 201 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/rss.py     |  87 +++++++++++++++++++++
 process        |  41 ++++++++++
 update-config  |  86 +++++++++++++++++++++
 8 files changed, 578 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 lib/args.py
 create mode 100644 lib/config.py
 create mode 100644 lib/fetcher.py
 create mode 100644 lib/files.py
 create mode 100644 lib/rss.py
 create mode 100755 process
 create mode 100755 update-config

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1198e59
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+podcasts/*
+__pycache__
+config.json
diff --git a/lib/args.py b/lib/args.py
new file mode 100644
index 0000000..87d5614
--- /dev/null
+++ b/lib/args.py
@@ -0,0 +1,96 @@
+"""Command-line argument handling for the ./process script."""
+
+import re
+import sys
+
+# Matches arguments consisting solely of ASCII digits (the episode count)
+isNumber = re.compile(r'^[0-9]+$')
+
+
+class Options:
+    """Settings chosen on the command line, initialised to their defaults."""
+
+    # Download each podcast's RSS feed (cleared by --no-feed / --skip-feed)
+    loadFeed = True
+    # Download podcast episodes (cleared by --no-download / --skip-download)
+    downloadEpisodes = True
+    # Generate transcripts and translations (cleared by --no-gen / --skip-gen)
+    generate = True
+    # Lower-cased text a podcast name must contain; None processes them all
+    filter = None
+    # 'new' or 'old': which end of the feed the episodes are taken from
+    episodes = 'new'
+    # Number of episodes to fetch for each matching podcast
+    numEpisodes = 1
+
+
+def outputHelp():
+    """Print the usage information for ./process."""
+    print("Usage: ./process [args] [filter [age] [episodes]]")
+    print("")
+    print("args:")
+    print("--help")
+    print(" Show this help information")
+    print("--no-feed, --skip-feed")
+    print(" Skip the initial step of downloading the RSS feeds")
+    print("--no-download, --skip-download")
+    print(" Skip downloading podcast episodes")
+    print("--no-gen, --skip-gen")
+    print(" Skip generating transcripts and translations of podcast episodes")
+    print("")
+    print("filter:")
+    print(" If specified, only configured podcasts which match the filter will be processed")
+    print(" A filter of 'pieds sur' would match 'Les Pieds sur terre'")
+    print("")
+    print("age:")
+    print(" 'new' (default), or 'old'")
+    print(" Fetch the newest or oldest episode(s) for the matching podcast(s)")
+    print("")
+    print("episodes:")
+    print(" 1 by default")
+    print(" The number of episode(s) to fetch for the matching podcast(s)")
+    print("")
+    print("Examples:")
+    print(" ./process --help")
+    print(" ./process --no-feed --skip-download")
+    print(" Only process previously downloaded episodes (useful e.g. if Whisper ran out of memory,")
+    print(" or a podcast wasn't configured to translate episodes)")
+    print(" ./process 'the few' old 3")
+    print(" Download and process the first 3 episodes of 'The Few Who Do'")
+
+
+def read():
+    """Parse sys.argv[1:] and return the resulting Options.
+
+    Flags, the age keyword ('new' or 'old') and the episode count may be given
+    in any order.  The first argument that is none of these becomes the
+    lower-cased podcast filter.  'help' or '--help' prints the usage text and
+    exits with status 0; an unknown '--' flag or a second filter prints an
+    error followed by the usage text and exits with status 1.
+    """
+    opts = Options()
+    for arg in sys.argv[1:]:
+        if arg in ('help', '--help'):
+            outputHelp()
+            sys.exit(0)
+        elif arg in ('--no-feed', '--skip-feed'):
+            opts.loadFeed = False
+        elif arg in ('--no-download', '--skip-download'):
+            opts.downloadEpisodes = False
+        elif arg in ('--no-gen', '--skip-gen'):
+            opts.generate = False
+        elif arg == 'new':
+            opts.episodes = 'new'
+        elif arg == 'old':
+            opts.episodes = 'old'
+        elif isNumber.match(arg):
+            opts.numEpisodes = int(arg)
+        elif opts.filter is None and not arg.startswith('--'):
+            # A mistyped flag must not silently become the podcast filter
+            opts.filter = arg.lower()
+        else:
+            print(f"Unrecognised argument: {arg}")
+            print("")
+            outputHelp()
+            sys.exit(1)
+    return opts
diff --git a/lib/config.py b/lib/config.py
new file mode 100644
index 0000000..f0a5ceb
--- /dev/null
+++ b/lib/config.py
@@ -0,0 +1,31 @@
+"""Loading and saving of the JSON configuration file."""
+
+import json
+
+# Configuration file, relative to the current working directory
+CONFIG_FILE = 'config.json'
+
+
+def load():
+    """Return the configuration stored in config.json as a dict.
+
+    A missing file yields an empty configuration.  A file which exists but
+    does not contain valid JSON raises json.JSONDecodeError instead of being
+    treated as empty, so that a later save() cannot silently discard it.
+    The 'translate' and 'podcasts' lists are added if they are absent.
+    """
+    try:
+        with open(CONFIG_FILE, 'r', encoding='utf-8') as configFile:
+            config = json.load(configFile)
+    except FileNotFoundError:
+        config = {}
+    config.setdefault('translate', [])
+    config.setdefault('podcasts', [])
+    return config
+
+
+def save(configData):
+    """Write configData to config.json as indented JSON plus a final newline."""
+    with open(CONFIG_FILE, 'w', encoding='utf-8') as configFile:
+        json.dump(configData, configFile, indent=4)
+        configFile.write('\n')
diff --git a/lib/fetcher.py b/lib/fetcher.py
new file mode 100644
index 0000000..86f4a85
--- /dev/null
+++ b/lib/fetcher.py
@@ -0,0 +1,33 @@
+"""Download helper which sends a custom User-Agent header."""
+
+import os
+import shutil
+import urllib.request
+
+USER_AGENT = 'Podkastomat/0.1.0'
+
+
+def download(remoteUrl, localFilename):
+    """Fetch remoteUrl and store the response body in localFilename.
+
+    The data is streamed into a temporary file next to the target, which is
+    moved into place only once the transfer has completed.  A failed request
+    therefore never leaves an empty or truncated localFilename behind, and an
+    existing localFilename is kept until its replacement is complete.  Errors
+    raised by urllib or the file system propagate to the caller.
+    """
+    directory, baseName = os.path.split(localFilename)
+    # Hidden, differently named file: never mistaken for a finished download
+    partialFilename = os.path.join(directory, f".{baseName}.part")
+    opener = urllib.request.build_opener()
+    opener.addheaders = [('User-agent', USER_AGENT)]
+    try:
+        with opener.open(remoteUrl) as remoteFile:
+            with open(partialFilename, 'wb') as localFile:
+                shutil.copyfileobj(remoteFile, localFile)
+        os.replace(partialFilename, localFilename)
+    except BaseException:
+        # Clean up the partial file, then let the original error continue
+        if os.path.exists(partialFilename):
+            os.remove(partialFilename)
+        raise
diff --git a/lib/files.py b/lib/files.py
new file mode 100644
index 0000000..9d34a8a
--- /dev/null
+++ b/lib/files.py
@@ -0,0 +1,201 @@
+"""File naming, episode download and Whisper invocation helpers."""
+
+import glob
+import re
+import lib.fetcher as fetcher
+import os
+import os.path
+import subprocess
+import urllib.parse
+
+from dateutil import parser as dateparser
+
+# Everything except word characters and '-' is stripped from a ref
+charsToRemove = re.compile(r'[^\w\-]*')
+multipleUnderscores = re.compile('__+')
+# An optional '#' followed by digits at the very start of an episode title
+startingNumerals = re.compile(r'^#?([0-9]+)')
+
+# Extension used when an episode URL does not contain one
+DEFAULT_AUDIO_EXTENSION = 'mp3'
+
+
+def addRef(podcast):
+    """Add the 'ref' (sanitised name) and 'dir' (storage directory) keys."""
+    podcast['ref'] = getRef(podcast['name'])
+    podcast['dir'] = f"podcasts/{podcast['lang']}/{podcast['ref']}"
+
+
+def getRef(name):
+    """Return name reduced to a lower-case, file-name friendly reference.
+
+    Spaces become underscores, every character that is neither a word
+    character nor '-' is dropped, and runs of underscores are collapsed.
+    """
+    ref = name.lower().replace(' ', '_')
+    ref = charsToRemove.sub('', ref)
+    return multipleUnderscores.sub('_', ref)
+
+
+def getExtension(fileName):
+    """Return the extension, without the dot, of a file name or URL.
+
+    The query string and fragment are ignored and only the last path segment
+    is examined, so dots in a host or directory name are never mistaken for
+    an extension.  Returns '' if there is no extension.
+    """
+    path = urllib.parse.urlsplit(fileName).path
+    baseName = path.rsplit('/', 1)[-1]
+    return os.path.splitext(baseName)[1][1:]
+
+
+def replaceExtension(fileName, newExtension):
+    """Return fileName with its last extension replaced by newExtension.
+
+    newExtension is given without a leading dot.  If fileName has no
+    extension, '.' and newExtension are simply appended.
+    """
+    root = os.path.splitext(fileName)[0]
+    return f"{root}.{newExtension}"
+
+
+def saveEpisode(podcast, episode):
+    """Download an episode into the podcast's directory unless already there.
+
+    podcast needs the 'name' and 'dir' keys; episode needs 'title',
+    'subtitle', 'url', 'season' and 'episode', and may have 'date'.
+
+    The stored file name starts with a reference taken, in order of
+    preference, from the episode (and season) number in the feed, from a
+    number at the start of the title, or from the publication date.  If a
+    file starting with that reference exists, the episode is skipped.  If no
+    reference can be determined, a message is printed and nothing is saved.
+    """
+    match = startingNumerals.match(episode['title'])
+    offset = 0
+    if episode['episode']:
+        # Episode (& possibly season) number encoded in the RSS
+        epNum = episode['episode']
+        openingRef = epNum.zfill(5)
+        identifier = f"{epNum}"
+        if episode['season']:
+            seasonNum = episode['season']
+            openingRef = 's' + seasonNum.zfill(3) + 'ep' + openingRef
+            identifier += f" of season {seasonNum}"
+    elif match:
+        # Episode number deduced from its title
+        epNum = match.group(1)
+        openingRef = epNum.zfill(5)
+        offset = len(match.group(0))
+        identifier = epNum
+    else:
+        # Podcast (or part thereof) where episodes only have a name and date
+        date = None
+        if "date" in episode:
+            try:
+                date = dateparser.parse(episode['date'])
+            except (ValueError, OverflowError):
+                pass  # unparseable date: reported below
+        if date is None:
+            print(f"Unable to determine identifier for episode:\n{episode}")
+            return
+        openingRef = date.strftime("%Y-%m-%d_%H%M")
+        identifier = date.strftime("from %d %b %Y at %H:%M")
+
+    # Sanitise the reference exactly like the final file name, so that the
+    # glob below recognises the file on later runs
+    openingRef = getRef(openingRef)
+    filenamePattern = glob.escape(f"{podcast['dir']}/{openingRef}_") + '*'
+    if glob.glob(filenamePattern):
+        print(f"Episode {identifier} of {podcast['name']} has been downloaded previously; skipping")
+        return
+
+    # Download new file
+    ext = getExtension(episode['url']) or DEFAULT_AUDIO_EXTENSION
+    episodeName = openingRef + '_' + episode['title'][offset:]
+    if episode['subtitle']:
+        episodeName += ' - ' + episode['subtitle']
+    fileName = f"{getRef(episodeName)}.{ext}"
+    print(f"Downloading episode {identifier} of {podcast['name']} as {fileName}")
+    fetcher.download(episode['url'], f"{podcast['dir']}/{fileName}")
+
+
+def findAudio():
+    """Return the sorted paths of all audio files below podcasts/.
+
+    The pattern matches the .m4a and .mp3 extensions (and, as a side effect
+    of its character classes, .m43 and .mpa).
+    """
+    return sorted(glob.glob('podcasts/*/*/*.m[4p][a3]'))
+
+
+def getLangCode(audioFile):
+    """Return the language directory of a 'podcasts/<lang>/...' path."""
+    parts = audioFile.split('/')
+    return parts[1]
+
+
+def getTranscriptFilename(audioFile):
+    """Return the path of the transcript belonging to audioFile."""
+    return replaceExtension(audioFile, 'transcript.txt')
+
+
+def hasTranscript(audioFile):
+    """Return True if audioFile's transcript file exists."""
+    return os.path.isfile(getTranscriptFilename(audioFile))
+
+
+def getTranslationFilename(audioFile):
+    """Return the path of the translation belonging to audioFile."""
+    return replaceExtension(audioFile, 'translation.txt')
+
+
+def hasTranslation(audioFile):
+    """Return True if audioFile's translation file exists."""
+    return os.path.isfile(getTranslationFilename(audioFile))
+
+
+def generateFromAudio(audioFile, task):
+    """Run Whisper on audioFile to create its transcript or translation.
+
+    task is 'transcribe'; any other value is treated as 'translate'.  The
+    whisper command is run in the audio file's directory with VTT output
+    requested, and the resulting .vtt file is renamed to
+    '<name>.transcript.txt' or '<name>.translation.txt'.  If whisper cannot
+    be started, fails, or leaves no .vtt file, a message is printed and
+    nothing is renamed.
+    """
+    if task == 'transcribe':
+        newExt = 'transcript.txt'
+    else:
+        task = 'translate'
+        newExt = 'translation.txt'
+
+    langCode = getLangCode(audioFile)
+    audioDir, fileName = os.path.split(audioFile)
+    cmd = [
+        'whisper', fileName,
+        '--model', 'medium',
+        '--language', langCode,
+        '--task', task,
+        '--output_format', 'vtt',
+        '--fp16', 'False',
+    ]
+    try:
+        # An argument list (no shell) is safe for any file name, and cwd=
+        # leaves this process's own working directory untouched
+        result = subprocess.run(cmd, cwd=audioDir or None)
+    except OSError as err:
+        print(f"Unable to run whisper on {audioFile}: {err}")
+        return
+    if result.returncode != 0:
+        print(f"whisper failed on {audioFile} (exit status {result.returncode})")
+        return
+
+    # rename transcript/translation file generated
+    generatedFile = os.path.join(audioDir, replaceExtension(fileName, 'vtt'))
+    if not os.path.isfile(generatedFile):
+        print(f"whisper did not create {generatedFile}")
+        return
+    newFileName = os.path.join(audioDir, replaceExtension(fileName, newExt))
+    os.rename(generatedFile, newFileName)
diff --git a/lib/rss.py b/lib/rss.py
new file mode 100644
index 0000000..2181310
--- /dev/null
+++ b/lib/rss.py
@@ -0,0 +1,87 @@
+"""Fetching and parsing of podcast RSS feeds."""
+
+import lib.fetcher as fetcher
+import os
+import os.path
+from xml.dom import minidom
+
+namespaces = {'itunes': 'itunes.com'}
+
+
+def fetch(podcast, loadFeed):
+    """Set podcast['feed'] and, if loadFeed is true, download the RSS feed.
+
+    The feed is stored as 'rss.xml' inside podcast['dir'], which is created
+    if it does not exist yet.
+    """
+    podcast['feed'] = podcast['dir'] + "/rss.xml"
+    os.makedirs(podcast['dir'], exist_ok=True)
+    if loadFeed:
+        print(f"Downloading RSS for podcast {podcast['name']}")
+        fetcher.download(podcast['url'], podcast['feed'])
+
+
+def extractField(node, childNodeName):
+    """Return the text of the first element named childNodeName below node.
+
+    The element's text and CDATA children are joined and surrounding
+    whitespace is removed.  Returns '' if there is no such element or it
+    holds no text.
+    """
+    children = node.getElementsByTagName(childNodeName)
+    if children.length == 0:
+        return ''
+    text = ''.join(
+        child.data for child in children[0].childNodes
+        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
+    )
+    return text.strip()
+
+
+def getEpisodes(podcast, episodesFrom, numEpisodes):
+    """Return up to numEpisodes episodes read from the podcast's stored feed.
+
+    With episodesFrom == 'old' the feed's <item> elements are read from the
+    last one backwards, otherwise from the first one onwards.  Items without
+    an <enclosure> URL are skipped, as there is nothing to download.  An
+    empty list is returned if the feed file is missing or has no <channel>.
+
+    Each episode is a dict with the keys 'title', 'subtitle', 'url',
+    'season' and 'episode' ('' where the feed has no value), plus 'date' if
+    the item has a pubDate.
+    """
+    print(f"Getting latest episode from {podcast['name']}")
+    if not os.path.isfile(podcast['feed']):
+        print(f"No RSS feed has been downloaded for {podcast['name']} yet")
+        return []
+    doc = minidom.parse(podcast['feed'])
+    channels = doc.getElementsByTagName('channel')
+    if channels.length == 0:
+        return []
+    epNodes = list(channels[0].getElementsByTagName('item'))
+    if episodesFrom == 'old':
+        epNodes.reverse()
+
+    episodes = []
+    for epNode in epNodes:
+        if len(episodes) >= numEpisodes:
+            break
+        enclosures = epNode.getElementsByTagName('enclosure')
+        if enclosures.length == 0 or not enclosures[0].getAttribute('url'):
+            continue
+        episode = {
+            'title': extractField(epNode, 'title'),
+            'subtitle': '',
+            'url': enclosures[0].getAttribute('url'),
+            'season': extractField(epNode, 'itunes:season'),
+            'episode': extractField(epNode, 'itunes:episode'),
+        }
+        # Only use the subtitle if the combined name stays reasonably short
+        subtitle = extractField(epNode, 'itunes:subtitle')
+        if len(episode['title']) + len(subtitle) < 150:
+            episode['subtitle'] = subtitle
+        pubDate = extractField(epNode, 'pubDate')
+        if pubDate:
+            episode['date'] = pubDate
+        episodes.append(episode)
+    return episodes
diff --git a/process b/process
new file mode 100755
index 0000000..d78a5ac
--- /dev/null
+++ b/process
@@ -0,0 +1,41 @@
+#!/usr/bin/python3
+"""Download configured podcast episodes and transcribe/translate them."""
+
+import inspect
+import sys
+import lib.args as args
+import lib.config as cfg
+import lib.files as files
+import lib.rss as rss
+
+
+def main():
+    """Run the steps selected on the command line and return the exit status."""
+    options = args.read()
+    config = cfg.load()
+
+    for podcast in config['podcasts']:
+        if options.filter and options.filter not in podcast['name'].lower():
+            continue
+        files.addRef(podcast)
+        rss.fetch(podcast, options.loadFeed)
+        if options.downloadEpisodes:
+            eps = rss.getEpisodes(podcast, options.episodes, options.numEpisodes)
+            for ep in eps:
+                print(ep)
+                files.saveEpisode(podcast, ep)
+
+    if not options.generate:
+        return 0
+    for audioFile in files.findAudio():
+        print(f"Audio: {audioFile}")
+        language = files.getLangCode(audioFile)
+        if not files.hasTranscript(audioFile):
+            files.generateFromAudio(audioFile, 'transcribe')
+        if language in config['translate'] and not files.hasTranslation(audioFile):
+            files.generateFromAudio(audioFile, 'translate')
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/update-config b/update-config
new file mode 100755
index 0000000..cef22d5
--- /dev/null
+++ b/update-config
@@ -0,0 +1,86 @@
+#!/usr/bin/python3
+#
+# Commands:
+#
+# translate lang
+#   Add a language whose podcasts are translated as well as transcribed
+#   E.g.: translate ru
+#
+# add
+#   Add a new podcast
+#   You will be prompted for the name, language, and URL of the podcast
+
+import lib.config as cfg
+import json
+import sys
+
+
+def outputUsage(cmd):
+    """Report cmd as unrecognised and print the usage information."""
+    print("Unrecognised command: '" + cmd + "'. Usage:")
+    print("")
+    print("./update-config translate lang")
+    print(" E.g. ./update-config translate de")
+    print(" Translate podcasts in the specified language")
+    print(" For the list of supported languages, check Whisper")
+    print("")
+    print("./update-config add")
+    print(" Add a new podcast which is to be transcribed and possibly translated")
+    print(" This is interactive and will ask for the name, language, and RSS URL for the podcast")
+
+
+def prompt(label):
+    """Show label and return the stripped line entered ('' at end of input)."""
+    try:
+        return input(label).strip()
+    except EOFError:
+        return ''
+
+
+def addTranslation(config, args):
+    """Add the language in args[0] to config['translate']; return exit status."""
+    if not args:
+        sys.stderr.write("Must specify the language\n")
+        return 2
+    lang = args[0]
+    # Listing a language twice would serve no purpose
+    if lang not in config['translate']:
+        config['translate'].append(lang)
+        cfg.save(config)
+    return 0
+
+
+def addPodcast(config):
+    """Ask for a podcast's details and append it to config['podcasts'].
+
+    Returns the exit status: 0 on success, 2 if any answer was left empty.
+    """
+    podcast = {
+        'name': prompt('Podcast name: '),
+        'lang': prompt('Language: '),
+        'url': prompt('RSS URL: '),
+    }
+    if not all(podcast.values()):
+        sys.stderr.write("Name, language and URL must all be given\n")
+        return 2
+    config['podcasts'].append(podcast)
+    cfg.save(config)
+    return 0
+
+
+def main():
+    """Dispatch on the command in sys.argv[1] and return the exit status."""
+    cmd = sys.argv[1] if len(sys.argv) > 1 else ''
+    if cmd not in ('translate', 'add'):
+        outputUsage(cmd)
+        return 1
+    config = cfg.load()
+    # Command to add a language to translate (in addition to generating a transcript)
+    if cmd == 'translate':
+        return addTranslation(config, sys.argv[2:])
+    # Command to add podcast (Name, Language, URL)
+    return addPodcast(config)
+
+
+if __name__ == '__main__':
+    sys.exit(main())