First version, seems to work

2025-01-21 22:59:56 +10:30 · 2025-01-21 22:59:56 +10:30 · 81ffe5a50f
commit 81ffe5a50f
8 changed files with 380 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,3 @@
+podcasts/*
+__pycache__
+config.json
--- a/lib/args.py
+++ b/lib/args.py
@ -0,0 +1,72 @@
+import re
+import sys
+
+# Matches an argument made up solely of ASCII digits
+isNumber = re.compile(r'^[0-9]+$')
+
+class Options:
+    """Run-time settings, initialised to the values used when no argument is given."""
+
+    # Pipeline stages (feed download, episode download, generation); all start enabled
+    loadFeed = downloadEpisodes = generate = True
+    # Podcast-name filter text; None until one is supplied
+    filter = None
+    # Which end of the feed to take episodes from: 'new' or 'old'
+    episodes = 'new'
+    # Number of episodes to take per podcast
+    numEpisodes = 1
+
+def outputHelp():
+    """Print the usage text for ./process to stdout, one entry per line."""
+    helpLines = (
+        "Usage: ./process [args] [filter [age] [episodes]]",
+        "",
+        "args:",
+        "--help",
+        "    Show this help information",
+        "--no-feed, --skip-feed",
+        "    Skip the initial step of downloading the RSS feeds",
+        "--no-download, --skip-download",
+        "    Skip downloading podcast episodes",
+        "--no-gen, --skip-gen",
+        "    Skip generating transcripts and translations of podcast episodes",
+        "",
+        "filter:",
+        "    If specified, only configured podcasts which match the filter will be processed",
+        "    A filter of 'pieds sur' would match 'Les Pieds sur terre'",
+        "",
+        "age:",
+        "    'new' (default), or 'old'",
+        "    Fetch the newest or oldest episode(s) for the matching podcast(s)",
+        "",
+        "episodes:",
+        "    1 by default",
+        "    The number of episode(s) to fetch for the matching podcast(s)",
+        "",
+        "Examples:",
+        "    ./process --help",
+        "    ./process --no-feed --skip-download",
+        "        Only process previously downloaded episodes (useful e.g. if Whisper ran out of memory,",
+        "        or a podcast wasn't configured to translate episodes)",
+        "    ./process 'the few' old 3",
+        "        Download and process the first 3 episodes of 'The Few Who Do'",
+    )
+    for helpLine in helpLines:
+        print(helpLine)
+
+def read():
+    """Build an Options instance from ``sys.argv[1:]``.
+
+    Recognised flags switch off pipeline stages, 'new'/'old' choose which end of the feed to
+    use, an all-digit argument sets the episode count, and the first remaining argument becomes
+    the lower-cased podcast filter.
+
+    'help' or '--help' prints the usage text and exits with status 0. Any further unrecognised
+    argument once a filter is set prints the usage text and exits with status 1.
+    """
+    opts = Options()
+    for arg in sys.argv[1:]:
+        if arg in ('help', '--help'):
+            outputHelp()
+            sys.exit(0)
+        elif arg in ('--no-feed', '--skip-feed'):
+            opts.loadFeed = False
+        elif arg in ('--no-download', '--skip-download'):
+            opts.downloadEpisodes = False
+        elif arg in ('--no-gen', '--skip-gen'):
+            opts.generate = False
+        elif arg in ('new', 'old'):
+            opts.episodes = arg
+        elif isNumber.match(arg):
+            opts.numEpisodes = int(arg)
+        elif opts.filter is None:
+            # Only one free-text argument is accepted; it is matched case-insensitively later
+            opts.filter = arg.lower()
+        else:
+            print(f"Unrecognised argument: {arg}")
+            print("")
+            outputHelp()
+            sys.exit(1)
+    return opts
--- a/lib/config.py
+++ b/lib/config.py
@ -0,0 +1,19 @@
+import json
+
+def load():
+    """Read ``config.json`` from the current working directory.
+
+    Returns the parsed JSON. If the file cannot be opened or does not contain valid JSON, an
+    empty configuration (no translation languages, no podcasts) is returned instead.
+    """
+    try:
+        # The with-block closes the file even when parsing fails
+        with open('config.json', 'r', encoding='utf-8') as configFile:
+            return json.load(configFile)
+    except (OSError, ValueError):
+        # OSError: missing/unreadable file. ValueError: malformed JSON or undecodable bytes.
+        return {
+            'translate': [],
+            'podcasts': [],
+        }
+
+def save(configData):
+    """Write ``configData`` to ``config.json`` as 4-space-indented JSON plus a trailing newline.
+
+    The file is created or overwritten in the current working directory.
+    """
+    # The with-block closes the file even if serialisation raises part-way through
+    with open('config.json', 'w', encoding='utf-8') as configFile:
+        json.dump(configData, configFile, indent=4)
+        configFile.write('\n')
--- a/lib/fetcher.py
+++ b/lib/fetcher.py
@ -0,0 +1,16 @@
+import urllib.request
+
+# NOTE(review): the opener built here is discarded immediately; download() builds its own
+urllib.request.build_opener()
+
+# Simple wrapper to spoof User-Agent when fetching remote files
+def download(remoteUrl, localFilename):
+    """Fetch ``remoteUrl`` and write the response body to ``localFilename``.
+
+    The request is sent with the User-agent header 'Podkastomat/0.1.0'. Errors raised while
+    opening the URL or writing the file propagate to the caller.
+    """
+    opener = urllib.request.build_opener()
+    opener.addheaders = [('User-agent', 'Podkastomat/0.1.0')]
+    # Open the remote resource first so a failed request does not truncate an existing local file
+    with opener.open(remoteUrl) as remoteFile, open(localFilename, 'wb') as localFile:
+        # Copy in chunks so a large episode is never held in memory in full
+        while True:
+            chunk = remoteFile.read(64 * 1024)
+            if not chunk:
+                break
+            localFile.write(chunk)
+
+
+
+
+
--- a/lib/files.py
+++ b/lib/files.py
@ -0,0 +1,117 @@
+import glob
+import re
+import lib.fetcher as fetcher
+import os
+import os.path
+
+from dateutil import parser as dateparser
+
+# Raw string: '\w' and '\-' are regex escapes, not valid Python string escapes
+# Matches anything that is neither a word character nor a hyphen
+charsToRemove = re.compile(r'[^\w\-]*')
+# Runs of two or more underscores
+multipleUnderscores = re.compile('__+')
+# A leading number, optionally preceded by '#'; group 1 is the digits
+startingNumerals = re.compile(r'^#?([0-9]+)')
+
+def addRef(podcast):
+    """Store the derived ``ref`` and ``dir`` entries on ``podcast`` (modified in place).
+
+    ``ref`` is getRef() of the podcast name; ``dir`` is ``podcasts/<lang>/<ref>``.
+    """
+    slug = getRef(podcast['name'])
+    podcast['ref'] = slug
+    podcast['dir'] = f"podcasts/{podcast['lang']}/{slug}"
+
+def getRef(name):
+    """Turn ``name`` into a lower-case reference string.
+
+    Spaces become underscores, characters matched by ``charsToRemove`` are dropped, and runs
+    of underscores are collapsed to a single one.
+    """
+    underscored = name.lower().replace(' ', '_')
+    cleaned = charsToRemove.sub('', underscored)
+    return multipleUnderscores.sub('_', cleaned)
+
+def getExtension(fileName):
+    """Return the text after the last '.' in ``fileName``, ignoring anything from the first '?'.
+
+    When there is no '.', the whole pre-'?' text is returned.
+    """
+    withoutQuery = fileName.split('?', 1)[0]
+    return withoutQuery.rsplit('.', 1)[-1]
+
+def replaceExtension(fileName, newExtension):
+    """Return ``fileName`` with everything after its last '.' replaced by ``newExtension``.
+
+    A name without any '.' yields just '.' followed by ``newExtension``.
+    """
+    stem = fileName.rpartition('.')[0]
+    return f"{stem}.{newExtension}"
+
+def saveEpisode(podcast, episode):
+    """Download ``episode`` into the podcast's directory unless a matching file already exists.
+
+    The file name starts with a prefix taken from, in order of preference: the episode (and
+    season) number in ``episode``, a number at the start of the title, or the episode date.
+    If none is available a message is printed and nothing is downloaded. An existing file
+    whose name starts with the same prefix counts as already downloaded.
+    """
+    numberedTitle = startingNumerals.match(episode['title'])
+    # Number of leading title characters left out of the file name
+    skip = 0
+    if episode['episode']:
+        # Episode (& possibly season) number encoded in the RSS
+        number = episode['episode']
+        prefix = number.zfill(5)
+        label = f"{number}"
+        if episode['season']:
+            season = episode['season']
+            prefix = 's' + season.zfill(3) + 'ep' + prefix
+            label += f" of season {season}"
+    elif numberedTitle:
+        # Episode number deduced from its title; the number itself is cut from the name
+        number = numberedTitle.group(1)
+        prefix = number.zfill(5)
+        skip = len(numberedTitle.group(0))
+        label = number
+    elif "date" in episode:
+        # Podcast (or part thereof) where episodes only have a name and date
+        published = dateparser.parse(episode['date'])
+        prefix = published.strftime("%Y-%m-%d_%H%M")
+        label = published.strftime("from %d %b %Y at %H:%M")
+    else:
+        print(f"Unable to determine identifier for episode:\n{episode}")
+        return
+
+    alreadyThere = glob.glob(f"{podcast['dir']}/{prefix}_*")
+    if len(alreadyThere) != 0:
+        print(f"Episode {label} of {podcast['name']} has been downloaded previously; skipping")
+        return
+
+    # Download new file
+    ext = getExtension(episode['url'])
+    rawName = prefix + '_' + episode['title'][skip:]
+    if episode['subtitle']:
+        rawName += ' - ' + episode['subtitle']
+    fileName = f"{getRef(rawName)}.{ext}"
+    print(f"Downloading episode {label} of {podcast['name']} as {fileName}")
+    fetcher.download(episode['url'], f"{podcast['dir']}/{fileName}")
+
+def findAudio():
+    """Return the paths of all ``.mp3`` and ``.m4a`` files under ``podcasts/<lang>/<podcast>/``.
+
+    Paths are relative to the current working directory, in the order glob yields them.
+    """
+    # The character classes alone would also accept '.mpa' and '.m43', so filter the result
+    return [
+        path
+        for path in glob.glob('podcasts/*/*/*.m[4p][a3]')
+        if path.endswith(('.m4a', '.mp3'))
+    ]
+
+def getLangCode(audioFile):
+    """Return the second '/'-separated component of ``audioFile``.
+
+    For paths shaped like ``podcasts/<lang>/...`` this is the language directory.
+    """
+    # Only the first two components matter, so stop splitting after them
+    return audioFile.split('/', 2)[1]
+
+def getTranscriptFilename(audioFile):
+    """Return the transcript path for ``audioFile``: its extension swapped for 'transcript.txt'."""
+    transcriptPath = replaceExtension(audioFile, 'transcript.txt')
+    return transcriptPath
+
+def hasTranscript(audioFile):
+    """Return True when the transcript file for ``audioFile`` exists as a regular file."""
+    return os.path.isfile(getTranscriptFilename(audioFile))
+
+def getTranslationFilename(audioFile):
+    """Return the translation path for ``audioFile``: its extension swapped for 'translation.txt'."""
+    translationPath = replaceExtension(audioFile, 'translation.txt')
+    return translationPath
+
+def hasTranslation(audioFile):
+    """Return True when the translation file for ``audioFile`` exists as a regular file."""
+    return os.path.isfile(getTranslationFilename(audioFile))
+
+def generateFromAudio(audioFile, task):
+    """Run the ``whisper`` command on ``audioFile`` and rename its VTT output.
+
+    ``task`` of 'transcribe' produces ``<name>.transcript.txt``; any other value is treated as
+    'translate' and produces ``<name>.translation.txt``. The language passed to whisper is
+    taken from the audio file's path via getLangCode().
+
+    Raises FileNotFoundError if the ``whisper`` executable is missing or if it did not produce
+    the expected ``.vtt`` file.
+    """
+    import subprocess
+
+    if task == 'transcribe':
+        newExt = 'transcript.txt'
+    else:
+        task = 'translate'
+        newExt = 'translation.txt'
+
+    langCode = getLangCode(audioFile)
+    workDir, _, fileName = audioFile.rpartition('/')
+
+    # Argument list (no shell) so unusual characters in the file name cannot alter the command.
+    # Running with cwd= makes whisper write beside the audio file without changing this
+    # process's working directory, so later relative paths keep resolving.
+    cmd = [
+        'whisper', fileName,
+        '--model', 'medium',
+        '--language', langCode,
+        '--task', task,
+        '--output_format', 'vtt',
+        '--fp16', 'False',
+    ]
+    subprocess.run(cmd, cwd=workDir or None)
+
+    # rename transcript/translation file generated
+    generatedFile = os.path.join(workDir, replaceExtension(fileName, 'vtt'))
+    newFileName = os.path.join(workDir, replaceExtension(fileName, newExt))
+    os.rename(generatedFile, newFileName)
--- a/lib/rss.py
+++ b/lib/rss.py
@ -0,0 +1,54 @@
+import lib.fetcher as fetcher
+import os.path
+from xml.dom import minidom
+
+# NOTE(review): not referenced by any function visible in this module — confirm it is still needed
+namespaces = {'itunes': 'itunes.com'}
+
+def fetch(podcast, loadFeed):
+    """Prepare the podcast's directory and, if ``loadFeed`` is true, download its RSS feed.
+
+    Sets ``podcast['feed']`` to ``<dir>/rss.xml`` and creates ``podcast['dir']`` when it does
+    not exist yet. The feed is fetched from ``podcast['url']`` into that file.
+    """
+    targetDir = podcast['dir']
+    podcast['feed'] = targetDir + "/rss.xml"
+    os.makedirs(targetDir, exist_ok=True)
+    if not loadFeed:
+        return
+    print(f"Downloading RSS for podcast {podcast['name']}")
+    fetcher.download(podcast['url'], podcast['feed'])
+
+# Get the text value of a child node with a given name, if available
+def extractField(node, childNodeName):
+    """Return the text of the first element named ``childNodeName`` found under ``node``.
+
+    Returns '' when there is no such element or when it has no text. The text is the
+    concatenation of that element's direct text and CDATA children, so an empty element
+    (``<tag/>``) and text split across several nodes are both handled.
+    """
+    children = node.getElementsByTagName(childNodeName)
+    if children.length == 0:
+        return ''
+    element = children[0]
+    return ''.join(
+        child.data
+        for child in element.childNodes
+        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
+    )
+
+def getEpisodes(podcast, episodesFrom, numEpisodes):
+    """Read up to ``numEpisodes`` episodes from the feed file at ``podcast['feed']``.
+
+    Items are taken from the start of the feed, or from the end working backwards when
+    ``episodesFrom`` is 'old'. Items without an ``<enclosure>`` are skipped. A feed with no
+    items yields an empty list.
+
+    Each episode is a dict with 'title', 'subtitle', 'url', 'season' and 'episode', plus
+    'date' when the item has a non-empty ``pubDate``. The subtitle is kept only when title and
+    subtitle together are shorter than 150 characters.
+
+    Raises IndexError if the document has no ``<channel>`` element.
+    """
+    print(f"Getting latest episode from {podcast['name']}")
+    doc = minidom.parse(podcast['feed'])
+    root = doc.getElementsByTagName('channel')[0]
+    epNodes = root.getElementsByTagName('item')
+    if episodesFrom == 'old':
+        # Walk the feed from its last <item> backwards
+        epNodes = reversed(epNodes)
+    episodes = []
+    for epNode in epNodes:
+        if len(episodes) >= numEpisodes:
+            break
+        enclosures = epNode.getElementsByTagName('enclosure')
+        if not enclosures:
+            # No attached file on this item, so there is nothing to download
+            continue
+        episode = {
+            'title': extractField(epNode, 'title'),
+            'subtitle': '',
+            'url': enclosures[0].getAttribute('url'),
+            'season': extractField(epNode, 'itunes:season'),
+            'episode': extractField(epNode, 'itunes:episode'),
+        }
+        subtitle = extractField(epNode, 'itunes:subtitle')
+        if len(episode['title']) + len(subtitle) < 150:
+            episode['subtitle'] = subtitle
+        pubDate = extractField(epNode, 'pubDate')
+        if pubDate:
+            episode['date'] = pubDate
+        episodes.append(episode)
+    return episodes
--- a/35
+++ b/35
@ -0,0 +1,35 @@
+#!/usr/bin/python3
+
+import inspect
+import lib.args as args
+import lib.config as cfg
+import lib.files as files
+import lib.rss as rss
+
+config = cfg.load()
+options = args.read()
+
+# Stage 1: fetch feeds and download episodes for every configured podcast that matches
+for podcast in config['podcasts']:
+    # The filter is already lower-cased, so compare against the lower-cased name
+    if options.filter and options.filter not in podcast['name'].lower():
+        continue
+    files.addRef(podcast)
+    rss.fetch(podcast, options.loadFeed)
+    if options.downloadEpisodes:
+        eps = rss.getEpisodes(podcast, options.episodes, options.numEpisodes)
+        for ep in eps:
+            print(ep)
+            files.saveEpisode(podcast, ep)
+
+# Stage 2: transcripts/translations for every audio file found on disk
+if not options.generate:
+    # SystemExit directly: the exit() helper is only installed by the site module
+    raise SystemExit(0)
+audioFiles = files.findAudio()
+for audioFile in audioFiles:
+    print(f"Audio: {audioFile}")
+    # NOTE(review): this unconditional `continue` makes the rest of the loop body unreachable,
+    # so no transcript or translation is ever generated — presumably a debugging leftover;
+    # confirm and remove it once generation is meant to run
+    continue
+    language = files.getLangCode(audioFile)
+    if not files.hasTranscript(audioFile):
+        files.generateFromAudio(audioFile, 'transcribe')
+    if language in config['translate'] and not files.hasTranslation(audioFile):
+        files.generateFromAudio(audioFile, 'translate')
+
+
--- a/64
+++ b/64
@ -0,0 +1,64 @@
+#!/usr/bin/python3
+#
+# Commands:
+#
+# translate lang
+#     Add a language to translate into
+#     E.g.: translate ru
+#
+# add
+#     Add a new podcast
+#     You will be prompted for the name, language, and URL of the podcast
+
+import lib.config as cfg
+import json
+import sys
+
+config = cfg.load()
+
+# First positional argument selects the command; empty when none was given
+cmd = sys.argv[1] if len(sys.argv) > 1 else ''
+
+# Command to add a language to translate (in addition to generating a transcript)
+if cmd == 'translate':
+    if len(sys.argv) < 3:
+        sys.stderr.write("Must specify the language\n")
+        sys.exit(2)
+    lang = sys.argv[2]
+    translations = config.setdefault('translate', [])
+    # Avoid storing the same language twice when the command is repeated
+    if lang not in translations:
+        translations.append(lang)
+    cfg.save(config)
+    sys.exit(0)
+
+# Command to add podcast (Name, Language, URL)
+elif cmd == 'add':
+    podcast = {}
+    prompts = (
+        ('name', 'Podcast name: '),
+        ('lang', 'Language: '),
+        ('url', 'RSS URL: '),
+    )
+    for key, prompt in prompts:
+        # end='' keeps the cursor on the prompt line; flush so it shows before the read blocks
+        print(prompt, end='', flush=True)
+        podcast[key] = sys.stdin.readline().strip()
+    config.setdefault('podcasts', []).append(podcast)
+    # Same on-disk format as the translate command
+    cfg.save(config)
+    sys.exit(0)
+else:
+    print("Unrecognised command: '" + cmd + "'. Usage:")
+    print("")
+    print("./update-config translate lang")
+    print("    E.g. ./update-config translate de")
+    print("    Translate podcasts in the specified language")
+    print("    For the list of supported languages, check Whisper")
+    print("")
+    print("./update-config add")
+    print("    Add a new podcast which is to be transcribed and possibly translated")
+    print("    This is interactive and will ask for the name, language, and RSS URL for the podcast")
+    sys.exit(1)
+
+