First version, seems to work
This commit is contained in:
commit
81ffe5a50f
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
podcasts/*
|
||||
__pycache__
|
||||
config.json
|
72
lib/args.py
Normal file
72
lib/args.py
Normal file
|
@ -0,0 +1,72 @@
|
|||
import re
|
||||
import sys
|
||||
|
||||
# Matches an argument made up solely of the digits 0-9; read() uses it to
# recognise the "number of episodes" argument.
isNumber = re.compile('^[0-9]+$')
|
||||
|
||||
class Options:
    """Command-line settings; the class attributes below are the defaults used by read()."""
    # Download the RSS feeds (turned off by --no-feed / --skip-feed)
    loadFeed = True
    # Download podcast episodes (turned off by --no-download / --skip-download)
    downloadEpisodes = True
    # Generate transcripts/translations (turned off by --no-gen / --skip-gen)
    generate = True
    # Lower-cased filter text, or None when no filter argument was given
    filter = None
    # Which end of the feed to take episodes from: 'new' or 'old'
    episodes = 'new'
    # How many episodes to fetch per podcast
    numEpisodes = 1
|
||||
|
||||
def outputHelp():
    """Print the usage text for ./process to standard output, one line per entry."""
    helpLines = (
        "Usage: ./process [args] [filter [age] [episodes]]",
        "",
        "args:",
        "--help",
        " Show this help information",
        "--no-feed, --skip-feed",
        " Skip the initial step of downloading the RSS feeds",
        "--no-download, --skip-download",
        " Skip downloading podcast episodes",
        "--no-gen, --skip-gen",
        " Skip generating transcripts and translations of podcast episodes",
        "",
        "filter:",
        " If specified, only configured podcasts which match the filter will be processed",
        " A filter of 'pieds sur' would match 'Les Pieds sur terre'",
        "",
        "age:",
        " 'new' (default), or 'old'",
        " Fetch the newest or oldest episode(s) for the matching podcast(s)",
        "",
        "episodes:",
        " 1 by default",
        " The number of episode(s) to fetch for the matching podcast(s)",
        "",
        "Examples:",
        " ./process --help",
        " ./process --no-feed --skip-download",
        " Only process previously downloaded episodes (useful e.g. if Whisper ran out of memory,",
        " or a podcast wasn't configured to translate episodes)",
        " ./process 'the few' old 3",
        " Download and process the first 3 episodes of 'The Few Who Do'",
    )
    for helpLine in helpLines:
        print(helpLine)
|
||||
|
||||
def read(argv=None):
    """Parse command-line arguments into an Options instance.

    Parameters:
        argv: optional list of argument strings; defaults to sys.argv[1:].

    Returns:
        An Options object with the switches, filter, age and episode count
        that were found.

    Each argument is classified on its own, in this order: help request,
    skip switches, 'new'/'old', an all-digit episode count, and finally the
    first otherwise-unrecognised argument becomes the (lower-cased) filter.
    A help request prints the usage text and exits with status 0; a second
    unrecognised argument prints an error plus the usage text and exits
    with status 1.
    """
    if argv is None:
        argv = sys.argv[1:]
    opts = Options()
    for arg in argv:
        if arg in ('help', '--help'):
            outputHelp()
            sys.exit(0)
        elif arg in ('--no-feed', '--skip-feed'):
            opts.loadFeed = False
        elif arg in ('--no-download', '--skip-download'):
            opts.downloadEpisodes = False
        elif arg in ('--no-gen', '--skip-gen'):
            opts.generate = False
        elif arg in ('new', 'old'):
            opts.episodes = arg
        elif isNumber.match(arg):
            opts.numEpisodes = int(arg)
        elif opts.filter is None:
            # Stored lower-cased; the caller compares it against lower-cased names
            opts.filter = arg.lower()
        else:
            print(f"Unrecognised argument: {arg}")
            print("")
            outputHelp()
            sys.exit(1)
    return opts
|
19
lib/config.py
Normal file
19
lib/config.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
import json
|
||||
|
||||
def load():
    """Load the configuration from config.json in the current directory.

    Returns:
        The parsed JSON content, or a fresh configuration with empty
        'translate' and 'podcasts' lists when the file is missing,
        unreadable, or not valid JSON.
    """
    try:
        # The context manager closes the file even when parsing fails
        with open('config.json', 'r', encoding='utf-8') as configFile:
            return json.load(configFile)
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: malformed JSON or bad
        # encoding. Anything else (e.g. KeyboardInterrupt) now propagates.
        return {
            'translate': [],
            'podcasts': [],
        }
|
||||
|
||||
def save(configData):
    """Write configData to config.json in the current directory.

    The data is serialised as JSON with 4-space indentation followed by a
    trailing newline. The file is closed even if serialisation fails.
    """
    with open('config.json', 'w', encoding='utf-8') as configFile:
        json.dump(configData, configFile, indent=4)
        configFile.write('\n')
|
16
lib/fetcher.py
Normal file
16
lib/fetcher.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
import urllib.request
|
||||
|
||||
# NOTE(review): the opener built here is discarded (download() builds its own),
# so this call appears to have no effect — confirm it can be removed.
urllib.request.build_opener()
|
||||
|
||||
# Simple wrapper to spoof User-Agent when fetching remote files
def download(remoteUrl, localFilename):
    """Fetch remoteUrl and store the response body in localFilename.

    The request is sent with the User-agent header 'Podkastomat/0.1.0'.
    The remote resource is opened before the local file is created, so a
    failed request does not leave an empty file behind. Both handles are
    closed on exit and the body is streamed rather than read in one go.
    """
    import shutil  # local import keeps this block independent of the file header

    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Podkastomat/0.1.0')]
    with opener.open(remoteUrl) as remoteFile, open(localFilename, 'wb') as localFile:
        shutil.copyfileobj(remoteFile, localFile)
|
||||
|
||||
|
||||
|
||||
|
||||
|
117
lib/files.py
Normal file
117
lib/files.py
Normal file
|
@ -0,0 +1,117 @@
|
|||
import glob
|
||||
import re
|
||||
import lib.fetcher as fetcher
|
||||
import os
|
||||
import os.path
|
||||
|
||||
from dateutil import parser as dateparser
|
||||
|
||||
# Runs of characters that are neither word characters nor hyphens; getRef()
# deletes them. Raw string avoids the invalid "\w" escape warning, and "+"
# avoids matching the empty string at every position.
charsToRemove = re.compile(r'[^\w\-]+')
# Two or more consecutive underscores; getRef() collapses them to one
multipleUnderscores = re.compile('__+')
# Optional '#' followed by digits at the start of an episode title
startingNumerals = re.compile(r'^#?([0-9]+)')
|
||||
|
||||
def addRef(podcast):
    """Add the 'ref' and 'dir' entries to a podcast dict, in place.

    'ref' is getRef() applied to the podcast name; 'dir' is
    podcasts/<lang>/<ref>.
    """
    slug = getRef(podcast['name'])
    podcast['ref'] = slug
    podcast['dir'] = f"podcasts/{podcast['lang']}/{slug}"
|
||||
|
||||
def getRef(name):
    """Turn a display name into a reference string.

    The name is lower-cased, spaces become underscores, characters matched
    by charsToRemove are deleted, and runs of underscores are collapsed.
    """
    underscored = name.lower().replace(' ', '_')
    cleaned = charsToRemove.sub('', underscored)
    return multipleUnderscores.sub('_', cleaned)
|
||||
|
||||
def getExtension(fileName):
    """Return the file extension (without the dot) of a URL or file name.

    Only the last path segment is inspected, so the query string, the
    fragment, and dots in the host name or in directory names are ignored.
    Returns '' when the last segment has no dot.
    """
    from urllib.parse import urlsplit  # local import keeps this block self-contained

    lastSegment = urlsplit(fileName).path.rsplit('/', 1)[-1]
    _, dot, extension = lastSegment.rpartition('.')
    return extension if dot else ''
|
||||
|
||||
def replaceExtension(fileName, newExtension):
    """Return fileName with its final extension replaced by newExtension.

    newExtension is given without a leading dot and may itself contain dots
    (e.g. 'transcript.txt'). A name without an extension simply gets the new
    one appended, and dots in directory names are left alone.
    """
    stem, _ = os.path.splitext(fileName)
    return stem + '.' + newExtension
|
||||
|
||||
def saveEpisode(podcast, episode):
    """Download an episode into the podcast's directory unless it is already there.

    A file-name prefix is derived from, in order of preference: the episode
    (and season) number given in the episode dict, a number at the start of
    the title, or the episode date. If any file in podcast['dir'] already
    starts with that prefix the download is skipped. Nothing is returned.
    """
    titleNumber = startingNumerals.match(episode['title'])
    # Number of leading title characters to drop when building the file name
    offset = 0
    if episode['episode']:
        # Episode (& possibly season) number encoded in the RSS
        epNum = episode['episode']
        openingRef = epNum.zfill(5)
        identifier = f"{epNum}"
        seasonNum = episode['season']
        if seasonNum:
            openingRef = 's' + seasonNum.zfill(3) + 'ep' + openingRef
            identifier += f" of season {seasonNum}"
    elif titleNumber:
        # Episode number deduced from its title; the numeral is cut from the title
        epNum = titleNumber.group(1)
        openingRef = epNum.zfill(5)
        offset = len(titleNumber.group(0))
        identifier = epNum
    elif "date" in episode:
        # Podcast (or part thereof) where episodes only have a name and date
        date = dateparser.parse(episode['date'])
        openingRef = date.strftime("%Y-%m-%d_%H%M")
        identifier = date.strftime("from %d %b %Y at %H:%M")
    else:
        print(f"Unable to determine identifier for episode:\n{episode}")
        return

    # Any existing file with the same prefix counts as "already downloaded"
    if glob.glob(f"{podcast['dir']}/{openingRef}_*"):
        print(f"Episode {identifier} of {podcast['name']} has been downloaded previously; skipping")
        return

    # Download new file
    ext = getExtension(episode['url'])
    episodeName = openingRef + '_' + episode['title'][offset:]
    if episode['subtitle']:
        episodeName += ' - ' + episode['subtitle']
    fileName = f"{getRef(episodeName)}.{ext}"
    print(f"Downloading episode {identifier} of {podcast['name']} as {fileName}")
    fetcher.download(episode['url'], f"{podcast['dir']}/{fileName}")
|
||||
|
||||
def findAudio():
    """Return the paths matching podcasts/*/*/*.m[4p][a3] relative to the current directory."""
    audioPattern = 'podcasts/*/*/*.m[4p][a3]'
    return list(glob.iglob(audioPattern))
|
||||
|
||||
def getLangCode(audioFile):
    """Return the second '/'-separated component of the path (the language directory)."""
    # Only the first two separators matter; index 1 is the language component
    return audioFile.split('/', 2)[1]
|
||||
|
||||
def getTranscriptFilename(audioFile):
    """Return the transcript path for an audio file (extension swapped for 'transcript.txt')."""
    transcriptPath = replaceExtension(audioFile, 'transcript.txt')
    return transcriptPath
|
||||
|
||||
def hasTranscript(audioFile):
    """Report whether the transcript for this audio file already exists as a file."""
    return os.path.isfile(getTranscriptFilename(audioFile))
|
||||
|
||||
def getTranslationFilename(audioFile):
    """Return the translation path for an audio file (extension swapped for 'translation.txt')."""
    translationPath = replaceExtension(audioFile, 'translation.txt')
    return translationPath
|
||||
|
||||
def hasTranslation(audioFile):
    """Report whether the translation for this audio file already exists as a file."""
    return os.path.isfile(getTranslationFilename(audioFile))
|
||||
|
||||
def generateFromAudio(audioFile, task):
    """Run Whisper on an audio file and store the result next to it.

    Parameters:
        audioFile: '/'-separated path to the audio file; its second
            component is used as the language code.
        task: 'transcribe' produces '<name>.transcript.txt'; any other value
            is treated as 'translate' and produces '<name>.translation.txt'.

    Raises:
        subprocess.CalledProcessError: if the whisper command exits non-zero.
        OSError: if the generated .vtt file cannot be renamed.

    Whisper is run with the audio file's directory as its working directory,
    so the current directory of this process is never changed. The previous
    version changed into the three-level audio directory and came back with
    chdir('../..'), which left the process one level too deep.
    """
    import subprocess  # local import keeps this block independent of the file header

    if task == 'transcribe':
        newExt = 'transcript.txt'
    else:
        task = 'translate'
        newExt = 'translation.txt'

    langCode = getLangCode(audioFile)
    directory, _, fileName = audioFile.rpartition('/')

    # Argument list (no shell) so unusual characters in the name cannot be
    # interpreted by a shell
    command = [
        'whisper', fileName,
        '--model', 'medium',
        '--language', langCode,
        '--task', task,
        '--output_format', 'vtt',
        '--fp16', 'False',
    ]
    subprocess.run(command, cwd=directory or None, check=True)

    # rename transcript/translation file generated
    generatedFile = os.path.join(directory, replaceExtension(fileName, 'vtt'))
    newFileName = os.path.join(directory, replaceExtension(fileName, newExt))
    os.rename(generatedFile, newFileName)
|
54
lib/rss.py
Normal file
54
lib/rss.py
Normal file
|
@ -0,0 +1,54 @@
|
|||
import lib.fetcher as fetcher
|
||||
import os.path
|
||||
from xml.dom import minidom
|
||||
|
||||
# NOTE(review): not referenced by any of the visible code in this module;
# presumably intended for namespace-aware lookups of itunes:* tags — confirm
# whether it is still needed.
namespaces = {'itunes': 'itunes.com'}
|
||||
|
||||
def fetch(podcast, loadFeed):
    """Record the feed path on the podcast and optionally download the feed.

    Sets podcast['feed'] to '<dir>/rss.xml', makes sure podcast['dir']
    exists, and, when loadFeed is true, downloads podcast['url'] to that
    feed path.
    """
    targetDir = podcast['dir']
    podcast['feed'] = targetDir + "/rss.xml"
    os.makedirs(targetDir, exist_ok=True)
    if not loadFeed:
        return
    print(f"Downloading RSS for podcast {podcast['name']}")
    fetcher.download(podcast['url'], podcast['feed'])
|
||||
|
||||
# Get the text value of the first descendant node with a given name, if available
def extractField(node, childNodeName):
    """Return the text of the first element named childNodeName below node.

    Returns '' when there is no such element or when it contains no text
    (e.g. an empty <itunes:subtitle/>), instead of failing on a missing
    first child. Text and CDATA children are concatenated so that content
    split over several nodes is not truncated.
    """
    matches = node.getElementsByTagName(childNodeName)
    if not matches:
        return ''
    element = matches[0]
    textTypes = (element.TEXT_NODE, element.CDATA_SECTION_NODE)
    return ''.join(
        child.data for child in element.childNodes if child.nodeType in textTypes
    )
|
||||
|
||||
def getEpisodes(podcast, episodesFrom, numEpisodes):
    """Read up to numEpisodes episode descriptions from the podcast's saved feed.

    Parameters:
        podcast: dict with 'name' and 'feed' (path of the downloaded RSS file).
        episodesFrom: 'old' walks the feed's items from the last one
            backwards; any other value walks from the first item forwards.
        numEpisodes: maximum number of items to look at.

    Returns:
        A list of dicts with the keys 'title', 'subtitle', 'url', 'season'
        and 'episode', plus 'date' when the item has a pubDate. The subtitle
        is kept only when title and subtitle together are shorter than 150
        characters. An empty feed yields an empty list, and items without an
        <enclosure> are skipped rather than raising.
    """
    print(f"Getting latest episode from {podcast['name']}")
    doc = minidom.parse(podcast['feed'])
    root = doc.getElementsByTagName('channel')[0]
    epNodes = list(root.getElementsByTagName('item'))
    if episodesFrom == 'old':
        epNodes.reverse()

    episodes = []
    # max() keeps a negative count from slicing off the end of the list
    for epNode in epNodes[:max(numEpisodes, 0)]:
        enclosures = epNode.getElementsByTagName('enclosure')
        if not enclosures:
            # Nothing to download for this item
            print(f"Skipping an item without an enclosure in {podcast['name']}")
            continue
        episode = {
            'title': extractField(epNode, 'title'),
            'subtitle': '',
            'url': enclosures[0].getAttribute('url'),
            'season': extractField(epNode, 'itunes:season'),
            'episode': extractField(epNode, 'itunes:episode'),
        }
        subtitle = extractField(epNode, 'itunes:subtitle')
        if len(episode['title']) + len(subtitle) < 150:
            episode['subtitle'] = subtitle
        pubDate = extractField(epNode, 'pubDate')
        if pubDate:
            episode['date'] = pubDate
        episodes.append(episode)
    return episodes
|
35
process
Executable file
35
process
Executable file
|
@ -0,0 +1,35 @@
|
|||
#!/usr/bin/python3
|
||||
|
||||
import inspect
|
||||
import lib.args as args
|
||||
import lib.config as cfg
|
||||
import lib.files as files
|
||||
import lib.rss as rss
|
||||
|
||||
# Load the saved configuration and the command-line options
config = cfg.load()
options = args.read()

# Step 1: refresh feeds and download episodes for every matching podcast
for podcast in config['podcasts']:
    # The filter is already lower-cased; skip podcasts whose name does not contain it
    if options.filter and options.filter not in podcast['name'].lower():
        continue
    files.addRef(podcast)
    rss.fetch(podcast, options.loadFeed)
    if options.downloadEpisodes:
        eps = rss.getEpisodes(podcast, options.episodes, options.numEpisodes)
        for ep in eps:
            print(ep)
            files.saveEpisode(podcast, ep)

# Step 2: transcribe (and, for configured languages, translate) downloaded audio.
# A stray `continue` right after the print used to make everything below it
# unreachable, so no transcript or translation was ever generated.
if options.generate:
    for audioFile in files.findAudio():
        print(f"Audio: {audioFile}")
        language = files.getLangCode(audioFile)
        if not files.hasTranscript(audioFile):
            files.generateFromAudio(audioFile, 'transcribe')
        if language in config['translate'] and not files.hasTranslation(audioFile):
            files.generateFromAudio(audioFile, 'translate')
|
||||
|
||||
|
64
update-config
Executable file
64
update-config
Executable file
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/python3
|
||||
#
|
||||
# Commands:
|
||||
#
|
||||
# translate lang
|
||||
# Add a language to translate into
|
||||
# E.g.: translate ru
|
||||
#
|
||||
# add
|
||||
# Add a new podcast
|
||||
# You will be prompted for the name, language, and URL of the podcast
|
||||
|
||||
import lib.config as cfg
|
||||
import json
|
||||
import sys
|
||||
|
||||
config = cfg.load()

# First positional argument selects the command; empty when none was given
cmd = sys.argv[1] if len(sys.argv) > 1 else ''

# Command to add a language to translate (in addition to generating a transcript)
if cmd == 'translate':
    if len(sys.argv) < 3:
        sys.stderr.write("Must specify the language\n")
        sys.exit(2)
    lang = sys.argv[2]
    # Avoid storing the same language twice when the command is repeated
    if lang not in config['translate']:
        config['translate'].append(lang)
    cfg.save(config)
    sys.exit(0)

# Command to add podcast (Name, Language, URL)
elif cmd == 'add':
    podcast = {}
    prompts = (
        ('name', 'Podcast name: '),
        ('lang', 'Language: '),
        ('url', 'RSS URL: '),
    )
    for key, prompt in prompts:
        # end='' keeps the cursor on the prompt line (end=None still printed a
        # newline); flush so the prompt is visible before reading
        print(prompt, end='', flush=True)
        podcast[key] = sys.stdin.readline().strip()
    config['podcasts'].append(podcast)
    # Same writer as the translate command, so both produce identical files
    cfg.save(config)
    sys.exit(0)
else:
    print("Unrecognised command: '" + cmd + "'. Usage:")
    print("")
    print("./update-config translate lang")
    print(" E.g. ./update-config translate de")
    print(" Translate podcasts in the specified language")
    print(" For the list of supported languages, check Whisper")
    print("")
    print("./update-config add")
    print(" Add a new podcast which is to be transcribed and possibly translated")
    print(" This is interactive and will ask for the name, language, and RSS URL for the podcast")
    sys.exit(1)
|
||||
|
||||
|
Loading…
Reference in a new issue