voicerepublic.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..compat import (
  5. compat_urllib_request,
  6. compat_urlparse,
  7. )
  8. from ..utils import (
  9. ExtractorError,
  10. determine_ext,
  11. int_or_none,
  12. )
  13. class VoiceRepublicIE(InfoExtractor):
  14. _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
  15. _TESTS = [{
  16. 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
  17. 'md5': '0554a24d1657915aa8e8f84e15dc9353',
  18. 'info_dict': {
  19. 'id': '2296',
  20. 'display_id': 'watching-the-watchers-building-a-sousveillance-state',
  21. 'ext': 'm4a',
  22. 'title': 'Watching the Watchers: Building a Sousveillance State',
  23. 'description': 'md5:715ba964958afa2398df615809cfecb1',
  24. 'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
  25. 'duration': 1800,
  26. 'view_count': int,
  27. }
  28. }, {
  29. 'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
  30. 'only_matching': True,
  31. }]
  32. def _real_extract(self, url):
  33. display_id = self._match_id(url)
  34. req = compat_urllib_request.Request(
  35. compat_urlparse.urljoin(url, '/talks/%s' % display_id))
  36. # Older versions of Firefox get redirected to an "upgrade browser" page
  37. req.add_header('User-Agent', 'youtube-dl')
  38. webpage = self._download_webpage(req, display_id)
  39. if '>Queued for processing, please stand by...<' in webpage:
  40. raise ExtractorError(
  41. 'Audio is still queued for processing', expected=True)
  42. data = self._parse_json(
  43. self._search_regex(
  44. r'(?s)return ({.+?});\s*\n', webpage,
  45. 'data', default=None),
  46. display_id, fatal=False)
  47. if data:
  48. title = data['title']
  49. description = data.get('teaser')
  50. talk_id = data.get('talk_id') or display_id
  51. talk = data['talk']
  52. duration = int_or_none(talk.get('duration'))
  53. formats = [{
  54. 'url': compat_urlparse.urljoin(url, talk_url),
  55. 'format_id': format_id,
  56. 'ext': determine_ext(talk_url) or format_id,
  57. 'vcodec': 'none',
  58. } for format_id, talk_url in talk['links'].items()]
  59. else:
  60. title = self._og_search_title(webpage)
  61. description = self._html_search_regex(
  62. r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>",
  63. webpage, 'description', fatal=False)
  64. talk_id = self._search_regex(
  65. [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"],
  66. webpage, 'talk id', default=None) or display_id
  67. duration = None
  68. formats = [{
  69. 'url': compat_urlparse.urljoin(url, talk_url),
  70. 'format_id': format_id,
  71. 'ext': determine_ext(talk_url) or format_id,
  72. 'vcodec': 'none',
  73. } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", webpage)]
  74. self._sort_formats(formats)
  75. thumbnail = self._og_search_thumbnail(webpage)
  76. view_count = int_or_none(self._search_regex(
  77. r"class='play-count[^']*'>\s*(\d+) plays",
  78. webpage, 'play count', fatal=False))
  79. return {
  80. 'id': talk_id,
  81. 'display_id': display_id,
  82. 'title': title,
  83. 'description': description,
  84. 'thumbnail': thumbnail,
  85. 'duration': duration,
  86. 'view_count': view_count,
  87. 'formats': formats,
  88. }