europa.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. from .common import InfoExtractor
  4. from ..compat import compat_urlparse
  5. from ..utils import (
  6. int_or_none,
  7. orderedSet,
  8. parse_duration,
  9. qualities,
  10. unified_strdate,
  11. xpath_text
  12. )
  13. class EuropaIE(InfoExtractor):
  14. _VALID_URL = r'https?://ec\.europa\.eu/avservices/video/player\.cfm\?.*?\bref=(?P<id>[A-Za-z0-9]+)'
  15. _TESTS = [{
  16. 'url': 'http://ec.europa.eu/avservices/video/player.cfm?ref=I107758',
  17. 'md5': '574f080699ddd1e19a675b0ddf010371',
  18. 'info_dict': {
  19. 'id': 'I107758',
  20. 'ext': 'mp4',
  21. 'title': 'TRADE - Wikileaks on TTIP',
  22. 'description': 'NEW LIVE EC Midday press briefing of 11/08/2015',
  23. 'thumbnail': 're:^https?://.*\.jpg$',
  24. 'upload_date': '20150811',
  25. 'duration': 34,
  26. 'view_count': int,
  27. 'formats': 'mincount:3',
  28. }
  29. }, {
  30. 'url': 'http://ec.europa.eu/avservices/video/player.cfm?sitelang=en&ref=I107786',
  31. 'only_matching': True,
  32. }]
  33. def _real_extract(self, url):
  34. video_id = self._match_id(url)
  35. playlist = self._download_xml(
  36. 'http://ec.europa.eu/avservices/video/player/playlist.cfm?ID=%s' % video_id, video_id)
  37. def get_item(type_, preference):
  38. items = {}
  39. for item in playlist.findall('./info/%s/item' % type_):
  40. lang, label = xpath_text(item, 'lg', default=None), xpath_text(item, 'label', default=None)
  41. if lang and label:
  42. items[lang] = label.strip()
  43. for p in preference:
  44. if items.get(p):
  45. return items[p]
  46. query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
  47. preferred_lang = query.get('sitelang', ('en', ))[0]
  48. preferred_langs = orderedSet((preferred_lang, 'en', 'int'))
  49. title = get_item('title', preferred_langs) or video_id
  50. description = get_item('description', preferred_langs)
  51. thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail')
  52. upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))
  53. duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))
  54. view_count = int_or_none(xpath_text(playlist,'./info/views', 'views'))
  55. language_preference = qualities(preferred_langs[::-1])
  56. formats = []
  57. for file_ in playlist.findall('./files/file'):
  58. video_url = xpath_text(file_, './url')
  59. if not video_url:
  60. continue
  61. lang = xpath_text(file_, './lg')
  62. formats.append({
  63. 'url': video_url,
  64. 'format_id': lang,
  65. 'format_note': xpath_text(file_, './lglabel'),
  66. 'language_preference': language_preference(lang)
  67. })
  68. self._sort_formats(formats)
  69. return {
  70. 'id': video_id,
  71. 'title': title,
  72. 'description': description,
  73. 'thumbnail': thumbnmail,
  74. 'upload_date': upload_date,
  75. 'duration': duration,
  76. 'view_count': view_count,
  77. 'formats': formats
  78. }