cctv.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import compat_str
  6. from ..utils import (
  7. float_or_none,
  8. try_get,
  9. unified_timestamp,
  10. )
  11. class CCTVIE(InfoExtractor):
  12. IE_DESC = '央视网'
  13. _VALID_URL = r'https?://(?:[^/]+)\.(?:cntv|cctv)\.(?:com|cn)/(?:[^/]+/)*?(?P<id>[^/?#&]+?)(?:/index)?(?:\.s?html|[?#&]|$)'
  14. _TESTS = [{
  15. 'url': 'http://sports.cntv.cn/2016/02/12/ARTIaBRxv4rTT1yWf1frW2wi160212.shtml',
  16. 'md5': 'd61ec00a493e09da810bf406a078f691',
  17. 'info_dict': {
  18. 'id': '5ecdbeab623f4973b40ff25f18b174e8',
  19. 'ext': 'mp4',
  20. 'title': '[NBA]二少联手砍下46分 雷霆主场击败鹈鹕(快讯)',
  21. 'description': 'md5:7e14a5328dc5eb3d1cd6afbbe0574e95',
  22. 'duration': 98,
  23. 'uploader': 'songjunjie',
  24. 'timestamp': 1455279956,
  25. 'upload_date': '20160212',
  26. },
  27. }, {
  28. 'url': 'http://tv.cctv.com/2016/02/05/VIDEUS7apq3lKrHG9Dncm03B160205.shtml',
  29. 'info_dict': {
  30. 'id': 'efc5d49e5b3b4ab2b34f3a502b73d3ae',
  31. 'ext': 'mp4',
  32. 'title': '[赛车]“车王”舒马赫恢复情况成谜(快讯)',
  33. 'description': '2月4日,蒙特泽莫罗透露了关于“车王”舒马赫恢复情况,但情况是否属实遭到了质疑。',
  34. 'duration': 37,
  35. 'uploader': 'shujun',
  36. 'timestamp': 1454677291,
  37. 'upload_date': '20160205',
  38. },
  39. 'params': {
  40. 'skip_download': True,
  41. },
  42. }, {
  43. 'url': 'http://english.cntv.cn/special/four_comprehensives/index.shtml',
  44. 'info_dict': {
  45. 'id': '4bb9bb4db7a6471ba85fdeda5af0381e',
  46. 'ext': 'mp4',
  47. 'title': 'NHnews008 ANNUAL POLITICAL SEASON',
  48. 'description': 'Four Comprehensives',
  49. 'duration': 60,
  50. 'uploader': 'zhangyunlei',
  51. 'timestamp': 1425385521,
  52. 'upload_date': '20150303',
  53. },
  54. 'params': {
  55. 'skip_download': True,
  56. },
  57. }, {
  58. 'url': 'http://cctv.cntv.cn/lm/tvseries_russian/yilugesanghua/index.shtml',
  59. 'info_dict': {
  60. 'id': 'b15f009ff45c43968b9af583fc2e04b2',
  61. 'ext': 'mp4',
  62. 'title': 'Путь,усыпанный космеями Серия 1',
  63. 'description': 'Путь, усыпанный космеями',
  64. 'duration': 2645,
  65. 'uploader': 'renxue',
  66. 'timestamp': 1477479241,
  67. 'upload_date': '20161026',
  68. },
  69. 'params': {
  70. 'skip_download': True,
  71. },
  72. }, {
  73. 'url': 'http://ent.cntv.cn/2016/01/18/ARTIjprSSJH8DryTVr5Bx8Wb160118.shtml',
  74. 'only_matching': True,
  75. }, {
  76. 'url': 'http://tv.cntv.cn/video/C39296/e0210d949f113ddfb38d31f00a4e5c44',
  77. 'only_matching': True,
  78. }, {
  79. 'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml',
  80. 'only_matching': True,
  81. }, {
  82. 'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml',
  83. 'only_matching': True,
  84. }, {
  85. 'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44',
  86. 'only_matching': True
  87. }]
  88. def _real_extract(self, url):
  89. video_id = self._match_id(url)
  90. webpage = self._download_webpage(url, video_id)
  91. video_id = self._search_regex(
  92. [r'var\s+guid\s*=\s*["\']([\da-fA-F]+)',
  93. r'videoCenterId["\']\s*,\s*["\']([\da-fA-F]+)',
  94. r'changePlayer\s*\(\s*["\']([\da-fA-F]+)',
  95. r'load[Vv]ideo\s*\(\s*["\']([\da-fA-F]+)'],
  96. webpage, 'video id')
  97. data = self._download_json(
  98. 'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do', video_id,
  99. query={
  100. 'pid': video_id,
  101. 'url': url,
  102. 'idl': 32,
  103. 'idlr': 32,
  104. 'modifyed': 'false',
  105. })
  106. title = data['title']
  107. formats = []
  108. video = data.get('video')
  109. if isinstance(video, dict):
  110. for quality, chapters_key in enumerate(('lowChapters', 'chapters')):
  111. video_url = try_get(
  112. video, lambda x: x[chapters_key][0]['url'], compat_str)
  113. if video_url:
  114. formats.append({
  115. 'url': video_url,
  116. 'format_id': 'http',
  117. 'quality': quality,
  118. 'preference': -1,
  119. })
  120. hls_url = try_get(data, lambda x: x['hls_url'], compat_str)
  121. if hls_url:
  122. hls_url = re.sub(r'maxbr=\d+&?', '', hls_url)
  123. formats.extend(self._extract_m3u8_formats(
  124. hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
  125. m3u8_id='hls', fatal=False))
  126. self._sort_formats(formats)
  127. uploader = data.get('editer_name')
  128. description = self._html_search_meta('description', webpage)
  129. timestamp = unified_timestamp(data.get('f_pgmtime'))
  130. duration = float_or_none(try_get(video, lambda x: x['totalLength']))
  131. return {
  132. 'id': video_id,
  133. 'title': title,
  134. 'description': description,
  135. 'uploader': uploader,
  136. 'timestamp': timestamp,
  137. 'duration': duration,
  138. 'formats': formats,
  139. }