# crunchyroll.py (9.0 KB)
# encoding: utf-8
from __future__ import unicode_literals

import re
import json
import base64
import zlib

from hashlib import sha1
from math import pow, sqrt, floor
from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    compat_urllib_parse,
    compat_urllib_request,
    bytes_to_intlist,
    intlist_to_bytes,
    unified_strdate,
    clean_html,
    urlencode_postdata,
)
from ..aes import (
    aes_cbc_decrypt,
    inc,
)


  24. class CrunchyrollIE(InfoExtractor):
  25. _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
  26. _TEST = {
  27. 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
  28. #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
  29. 'info_dict': {
  30. 'id': '645513',
  31. 'ext': 'flv',
  32. 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
  33. 'description': 'md5:2d17137920c64f2f49981a7797d275ef',
  34. 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
  35. 'uploader': 'Yomiuri Telecasting Corporation (YTV)',
  36. 'upload_date': '20131013',
  37. },
  38. 'params': {
  39. # rtmp
  40. 'skip_download': True,
  41. },
  42. }
  43. _FORMAT_IDS = {
  44. '360': ('60', '106'),
  45. '480': ('61', '106'),
  46. '720': ('62', '106'),
  47. '1080': ('80', '108'),
  48. }
  49. def _login(self):
  50. (username, password) = self._get_login_info()
  51. if username is None:
  52. return
  53. self.report_login()
  54. login_url = 'https://www.crunchyroll.com/?a=formhandler'
  55. data = urlencode_postdata({
  56. 'formname': 'RpcApiUser_Login',
  57. 'name': username,
  58. 'password': password,
  59. })
  60. login_request = compat_urllib_request.Request(login_url, data)
  61. login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
  62. self._download_webpage(login_request, None, False, 'Wrong login info')
  63. def _real_initialize(self):
  64. self._login()
  65. def _decrypt_subtitles(self, data, iv, id):
  66. data = bytes_to_intlist(data)
  67. iv = bytes_to_intlist(iv)
  68. id = int(id)
  69. def obfuscate_key_aux(count, modulo, start):
  70. output = list(start)
  71. for _ in range(count):
  72. output.append(output[-1] + output[-2])
  73. # cut off start values
  74. output = output[2:]
  75. output = list(map(lambda x: x % modulo + 33, output))
  76. return output
  77. def obfuscate_key(key):
  78. num1 = int(floor(pow(2, 25) * sqrt(6.9)))
  79. num2 = (num1 ^ key) << 5
  80. num3 = key ^ num1
  81. num4 = num3 ^ (num3 >> 3) ^ num2
  82. prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2)))
  83. shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
  84. # Extend 160 Bit hash to 256 Bit
  85. return shaHash + [0] * 12
  86. key = obfuscate_key(id)
  87. class Counter:
  88. __value = iv
  89. def next_value(self):
  90. temp = self.__value
  91. self.__value = inc(self.__value)
  92. return temp
  93. decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
  94. return zlib.decompress(decrypted_data)
  95. def _convert_subtitles_to_srt(self, subtitles):
  96. output = ''
  97. for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
  98. start = start.replace('.', ',')
  99. end = end.replace('.', ',')
  100. text = clean_html(text)
  101. text = text.replace('\\N', '\n')
  102. if not text:
  103. continue
  104. output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
  105. return output
  106. def _real_extract(self,url):
  107. mobj = re.match(self._VALID_URL, url)
  108. video_id = mobj.group('video_id')
  109. if mobj.group('prefix') == 'm':
  110. mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage')
  111. webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url')
  112. else:
  113. webpage_url = 'http://www.' + mobj.group('url')
  114. webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
  115. note_m = self._html_search_regex(r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='')
  116. if note_m:
  117. raise ExtractorError(note_m)
  118. mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
  119. if mobj:
  120. msg = json.loads(mobj.group('msg'))
  121. if msg.get('type') == 'error':
  122. raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
  123. video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
  124. video_title = re.sub(r' {2,}', ' ', video_title)
  125. video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
  126. if not video_description:
  127. video_description = None
  128. video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL)
  129. if video_upload_date:
  130. video_upload_date = unified_strdate(video_upload_date)
  131. video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)
  132. playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
  133. playerdata_req = compat_urllib_request.Request(playerdata_url)
  134. playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
  135. playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  136. playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
  137. stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
  138. video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
  139. formats = []
  140. for fmt in re.findall(r'\?p([0-9]{3,4})=1', webpage):
  141. stream_quality, stream_format = self._FORMAT_IDS[fmt]
  142. video_format = fmt+'p'
  143. streamdata_req = compat_urllib_request.Request('http://www.crunchyroll.com/xml/')
  144. # urlencode doesn't work!
  145. streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality='+stream_quality+'&media%5Fid='+stream_id+'&video%5Fformat='+stream_format
  146. streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  147. streamdata_req.add_header('Content-Length', str(len(streamdata_req.data)))
  148. streamdata = self._download_webpage(streamdata_req, video_id, note='Downloading media info for '+video_format)
  149. video_url = self._search_regex(r'<host>([^<]+)', streamdata, 'video_url')
  150. video_play_path = self._search_regex(r'<file>([^<]+)', streamdata, 'video_play_path')
  151. formats.append({
  152. 'url': video_url,
  153. 'play_path': video_play_path,
  154. 'ext': 'flv',
  155. 'format': video_format,
  156. 'format_id': video_format,
  157. })
  158. subtitles = {}
  159. for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
  160. sub_page = self._download_webpage('http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id='+sub_id,\
  161. video_id, note='Downloading subtitles for '+sub_name)
  162. id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
  163. iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
  164. data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
  165. if not id or not iv or not data:
  166. continue
  167. id = int(id)
  168. iv = base64.b64decode(iv)
  169. data = base64.b64decode(data)
  170. subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
  171. lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
  172. if not lang_code:
  173. continue
  174. subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
  175. return {
  176. 'id': video_id,
  177. 'title': video_title,
  178. 'description': video_description,
  179. 'thumbnail': video_thumbnail,
  180. 'uploader': video_uploader,
  181. 'upload_date': video_upload_date,
  182. 'subtitles': subtitles,
  183. 'formats': formats,
  184. }