pentabarf.py 7.71 KB
Newer Older
Stefano Rivera's avatar
Stefano Rivera committed
1 2 3 4 5 6 7
#!/usr/bin/env python3

# URLs to XML for DebConf 7..13
# http://penta.debconf.org/dc7_schedule/schedule.en.xml

import argparse
import re
8
from datetime import datetime
Stefano Rivera's avatar
Stefano Rivera committed
9
from pathlib import Path
10
from urllib.parse import urljoin
Stefano Rivera's avatar
Stefano Rivera committed
11 12 13 14 15
from xml.etree import ElementTree

import dateutil.parser
import requests

Stefano Rivera's avatar
Stefano Rivera committed
16
from utils.files import dc11_filename, files_with_prefix, penta_filename
Stefano Rivera's avatar
Stefano Rivera committed
17
from utils.objects import Conference, Event, Meta, VideoFormat
18
from utils.pentabarf import parse_time, parse_timedelta
Stefano Rivera's avatar
Stefano Rivera committed
19 20 21
from utils.yaml import yaml_dump


Stefano Rivera's avatar
Stefano Rivera committed
22
def scrape(url, args):
    """Download a Pentabarf schedule XML feed and turn it into metadata.

    Args:
        url: URL of the Pentabarf XML feed to fetch.
        args: Parsed command-line arguments; ``args.verify`` is handed to
            ``requests.get`` as its ``verify`` option.

    Returns:
        A ``Meta`` holding the scraped conference and its videos, the
        videos ordered by their start time.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = requests.get(url, verify=args.verify)
    # Fail loudly on 4xx/5xx rather than trying to parse an error page as
    # schedule XML.
    response.raise_for_status()
    tree = ElementTree.fromstring(response.content)

    conference = scrape_conference(url, tree.find('conference'), args)
    videos = list(scrape_videos(tree, conference, args))
    videos.sort(key=lambda event: event.start)
    return Meta(
        conference=conference,
        videos=videos,
    )


def index_files(conference):
    """Map lower-cased video titles to file stems for a conference.

    Lists the files under the conference's ``high/`` directory and, for
    each one, uses the part of the file stem after its first underscore
    (lower-cased) as the key and the full stem as the value.
    """
    # The first 56 characters of video_base are dropped; presumably that is
    # the archive root URL, leaving a path relative to it -- TODO confirm.
    prefix = '{}high/'.format(conference.video_base[56:])
    stems = (Path(found).stem for found in files_with_prefix(prefix))
    return {stem.split('_', 1)[1].lower(): stem for stem in stems}


Stefano Rivera's avatar
Stefano Rivera committed
45 46 47 48 49 50 51 52
def dc13_filename(id_):
    """Return the stem of the first debconf13 archival file for an event.

    debconf13's filenames are named after the event ID, so files are looked
    up by the ``<id>_`` prefix.  Returns None when nothing matches.
    """
    prefix = '2013/debconf13/archival/{}_'.format(id_)
    stems = (Path(found).stem for found in files_with_prefix(prefix))
    return next(stems, None)


Stefano Rivera's avatar
Stefano Rivera committed
53
def summit_filename(conference, title):
    """Find the video file of a DebConf 14/15 talk from its title.

    The title is turned into a slug and the first file whose path starts
    with the edition's path template, filled in with that slug, is used.

    Args:
        conference: The conference; its ``series`` and ``edition`` select
            the path template.
        title: The talk's title.

    Returns:
        The stem of the first matching file, or None if there is none.

    Raises:
        ValueError: If the conference is not DebConf edition 14 or 15, as
            no path template is known for it.
    """
    templates = {
        14: '2014/debconf14/webm/{}.webm',
        15: '2015/debconf15/{}.webm',
    }
    base = None
    if conference.series == 'DebConf':
        base = templates.get(conference.edition)
    if base is None:
        # Previously this fell through to an UnboundLocalError on `base`.
        raise ValueError(
            'No video filename layout known for {} edition {}'.format(
                conference.series, conference.edition))

    # Copied from veyepar's dj/main/unique_slugify.py
    fname = ''.join(
        c for c in title if c.isalpha() or c.isdigit() or c in ' _')
    fname = fname.replace(' ', '_')
    # Collapse runs of underscores and trim them from both ends
    fname = '_'.join(w for w in fname.split('_') if w)

    for filename in files_with_prefix(base.format(fname)):
        return Path(filename).stem
    return None


Stefano Rivera's avatar
Stefano Rivera committed
71
def _event_description(penta_event):
    """Combine an event's subtitle, abstract and description into one text.

    Empty sections are dropped, the remaining ones are separated by a blank
    line, and CRLF line endings are converted to LF.
    """
    sections = (
        # default='' keeps a missing element from crashing .strip()
        penta_event.findtext(tag, default='').strip()
        for tag in ('subtitle', 'abstract', 'description'))
    text = '\n\n'.join(section for section in sections if section)
    return text.replace('\r\n', '\n')


def scrape_videos(tree, conference, args):
    """Yield an Event for every scheduled event that has a video file.

    Walks every ``<day>`` of the schedule and every ``<event>`` inside it.
    An event ID is only handled the first time it is seen, and events for
    which no video filename is found are skipped.

    Args:
        tree: Root element of the parsed Pentabarf schedule.
        conference: The conference the schedule belongs to; its ``series``,
            ``edition`` and ``schedule`` attributes are read.
        args: Parsed command-line arguments (not used here).

    Yields:
        One ``Event`` per event with a video.

    Raises:
        KeyError: If an event's language is not in the known mapping.
    """
    seen = set()
    for day in tree.iter('day'):
        date = dateutil.parser.parse(day.attrib['date']).date()
        print('Date:', date)

        for penta_event in day.iter('event'):
            id_ = penta_event.attrib['id']
            if id_ in seen:
                continue
            seen.add(id_)
            title = penta_event.findtext('title')

            # Default lookup is by event ID; some DebConf editions name
            # their files differently.
            video = penta_filename(id_)
            if conference.series == 'DebConf':
                if conference.edition == 11:
                    video = dc11_filename(title)
                if conference.edition == 13:
                    video = dc13_filename(id_)
                if conference.edition in (14, 15):
                    video = summit_filename(conference, title)

            if not video:
                continue

            description = _event_description(penta_event)

            # Fall back to the per-event page when the feed has no conf_url
            details_url = penta_event.findtext('conf_url')
            if details_url:
                details_url = urljoin(conference.schedule, details_url)
            else:
                details_url = urljoin(
                    conference.schedule, 'events/{}.en.html'.format(id_))

            start = datetime.combine(
                date, parse_time(penta_event.findtext('start')))
            end = start + parse_timedelta(penta_event.findtext('duration'))
            event = Event(
                title=title,
                description=description,
                details_url=details_url,
                speakers=[
                    person.text for person in penta_event.iter('person')],
                room=penta_event.findtext('room'),
                start=start,
                end=end,
                video='high/{}.ogv'.format(video),
            )
            if conference.series == 'DebConf':
                if conference.edition < 14:
                    event.alt_formats = {
                        'low': 'low/{}.ogv'.format(video),
                    }
                if conference.edition < 9:
                    # Swap the last character so that .ogv becomes .ogg
                    event.video = event.video[:-1] + 'g'
                    for format_, path in event.alt_formats.items():
                        event.alt_formats[format_] = path[:-1] + 'g'
                if conference.edition == 7:
                    event.alt_formats.update({
                        'mpeg-ntsc': 'ntsc-dvd/{}.mpeg'.format(video),
                        'mpeg-pal': 'pal-dvd/{}.mpeg'.format(video),
                    })
                if conference.edition == 13:
                    event.video = 'archival/{}.ogv'.format(video)
                    event.alt_formats['high'] = 'high/{}.ogv'.format(video)
                if conference.edition == 14:
                    event.video = 'webm/{}.webm'.format(video)
                if conference.edition == 15:
                    event.video = '{}.webm'.format(video)

            penta_lang = penta_event.findtext('language')
            if penta_lang:
                # An unlisted language raises KeyError rather than being
                # passed through silently.
                event.language = {
                    'English': 'eng',
                    'en': 'eng',
                    'es': 'spa',
                    'fr': 'fra',
                }[penta_lang]

            yield event


Stefano Rivera's avatar
Stefano Rivera committed
160
def scrape_conference(url, penta_conf, args):
    """Build the Conference from the feed's ``<conference>`` element.

    The edition number is taken from the digits at the end of the
    conference title; the year, website and video base URL are derived
    from it.

    Args:
        url: URL of the schedule feed, stored as the conference schedule.
        penta_conf: The ``<conference>`` element of the Pentabarf XML.
        args: Parsed command-line arguments (not used here).

    Returns:
        The populated ``Conference``; ``location`` is set only when the
        feed names a city.

    Raises:
        ValueError: If the conference title does not end in a number.
    """
    title = penta_conf.findtext('title')
    edition_match = re.search(r'\d+$', title or '')
    if edition_match is None:
        raise ValueError(
            'Cannot determine the edition from conference title '
            '{!r}'.format(title))
    edition = int(edition_match.group(0))
    start = dateutil.parser.parse(penta_conf.findtext('start')).date()
    end = dateutil.parser.parse(penta_conf.findtext('end')).date()
    year = 2000 + edition
    video_formats = {}

    if edition < 14:
        video_formats['default'] = VideoFormat(
            resolution='720x576',
            bitrate='1700k',
            container='ogg',
            vcodec='theora',
            acodec='vorbis',
        )
        video_formats['low'] = VideoFormat(
            resolution='320x240',
            bitrate='300k',
            container='ogg',
            vcodec='theora',
            acodec='vorbis',
        )
    else:
        video_formats['default'] = VideoFormat(
            resolution='720x576',
            bitrate='1m',
            container='matroska',
            vcodec='vp8',
            acodec='vorbis',
        )

    # Edition 7 additionally has two MPEG formats
    if edition == 7:
        video_formats['mpeg-ntsc'] = VideoFormat(
            resolution='352x240',
            bitrate='400k',
            container='mpeg1',
            vcodec='mpeg1',
            acodec='ac3',
        )
        video_formats['mpeg-pal'] = VideoFormat(
            resolution='352x240',
            bitrate='400k',
            container='mpeg1',
            vcodec='mpeg1',
            acodec='mp2',
        )

    conf = Conference(
        title=title,
        series='DebConf',
        edition=edition,
        date=[start, end],
        website='https://debconf{}.debconf.org/'.format(edition),
        schedule=url,
        video_base='https://meetings-archive.debian.net/pub/debian-meetings/'
                   '{}/debconf{}/'.format(year, edition),
        video_formats=video_formats,
    )

    city = penta_conf.findtext('city')
    if city:
        conf.location = city

    return conf

Stefano Rivera's avatar
Stefano Rivera committed
225 226 227 228

def main():
    """Scrape the feed named on the command line into ``scraped.yml``."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url', help='Pentabarf XML feed URL')
    # -k flips args.verify to False; it is True by default
    arg_parser.add_argument(
        '-k', '--no-verify', dest='verify', action='store_false',
        help='Disable SSL verification')
    options = arg_parser.parse_args()

    scraped = scrape(options.url, options)

    with open('scraped.yml', 'w') as output:
        yaml_dump(scraped, output)


# Run the scraper only when executed as a script, not when imported
if __name__ == '__main__':
    main()