pornhub.py
# coding: utf-8
from __future__ import unicode_literals

import functools
import itertools
import operator
import re

from .common import InfoExtractor
from ..compat import (
    compat_HTTPError,
    compat_str,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    js_to_json,
    orderedSet,
    remove_quotes,
    str_to_int,
    url_or_none,
)


class PornHubIE(InfoExtractor):
    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
                    '''
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '1e19b41231a02eba417839222ac9d58e',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
            'uploader': 'Babes',
            'duration': 361,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
    }, {
        # non-ASCII title
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
        'info_dict': {
            'id': '1331683002',
            'ext': 'mp4',
            'title': '重庆婷婷女王足交',
            'uploader': 'Unknown',
            'duration': 1753,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # subtitles
        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
        'info_dict': {
            'id': 'ph5af5fef7c2aa7',
            'ext': 'mp4',
            'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
            'uploader': 'BFFs',
            'duration': 622,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
            'subtitles': {
                'en': [{
                    'ext': 'srt',
                }],
            },
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }, {
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
    }, {
        # removed by uploader
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
    }, {
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
    }, {
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
        'only_matching': True,
    }]

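    # As far as I can tell, this helper is called by youtube-dl's generic
    # extractor to discover PornHub players embedded on third-party pages;
    # it returns every matching iframe embed URL found in the page source.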
    @staticmethod
    def _extract_urls(webpage):
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
            webpage)

    def _extract_count(self, pattern, webpage, name):
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _real_extract(self, url):
        video_id = self._match_id(url)

        self._set_cookie('pornhub.com', 'age_verified', '1')

        def dl_webpage(platform):
            self._set_cookie('pornhub.com', 'platform', platform)
            return self._download_webpage(
                'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
                video_id, 'Downloading %s webpage' % platform)

        webpage = dl_webpage('pc')

        error_msg = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
            webpage, 'error message', default=None, group='error')
        if error_msg:
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        # video_title from flashvars contains whitespace instead of non-ASCII
        # characters (see http://www.pornhub.com/view_video.php?viewkey=1331683002),
        # so it is no longer relied upon.
        title = self._html_search_meta(
            'twitter:title', webpage, default=None) or self._search_regex(
            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
            webpage, 'title', group='title')

        video_urls = []
        video_urls_set = set()

        subtitles = {}

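        # The watch page embeds a flashvars_<video id> JS object that carries
        # the direct media URLs (mediaDefinitions), thumbnail, duration and an
        # optional closed-captions file.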
        flashvars = self._parse_json(
            self._search_regex(
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            video_id)
        if flashvars:
            subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
            if subtitle_url:
                subtitles.setdefault('en', []).append({
                    'url': subtitle_url,
                    'ext': 'srt',
                })
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
            media_definitions = flashvars.get('mediaDefinitions')
            if isinstance(media_definitions, list):
                for definition in media_definitions:
                    if not isinstance(definition, dict):
                        continue
                    video_url = definition.get('videoUrl')
                    if not video_url or not isinstance(video_url, compat_str):
                        continue
                    if video_url in video_urls_set:
                        continue
                    video_urls_set.add(video_url)
                    video_urls.append(
                        (video_url, int_or_none(definition.get('quality'))))
        else:
            thumbnail, duration = [None] * 2

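        # Fallback: if flashvars yielded no media URLs, fetch the 'tv' layout,
        # where the media URL is assembled by a small obfuscated JS snippet
        # (plain string concatenation) that is decoded below.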
        if not video_urls:
            tv_webpage = dl_webpage('tv')

            assignments = self._search_regex(
                r'(var.+?mediastring.+?)</script>', tv_webpage,
                'encoded url').split(';')

            js_vars = {}

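            # Evaluates a minimal subset of JS: strips /* ... */ comments,
            # resolves '+' string concatenation and previously assigned
            # variables, and unquotes string literals.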
            def parse_js_value(inp):
                inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                if '+' in inp:
                    inps = inp.split('+')
                    return functools.reduce(
                        operator.concat, map(parse_js_value, inps))
                inp = inp.strip()
                if inp in js_vars:
                    return js_vars[inp]
                return remove_quotes(inp)

            for assn in assignments:
                assn = assn.strip()
                if not assn:
                    continue
                assn = re.sub(r'var\s+', '', assn)
                vname, value = assn.split('=', 1)
                js_vars[vname] = parse_js_value(value)

            video_url = js_vars['mediastring']
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

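        # Also collect any direct download links exposed via the page's
        # download buttons.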
        for mobj in re.finditer(
                r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
                webpage):
            video_url = mobj.group('url')
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

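        # Build the format list; when no quality was supplied by
        # mediaDefinitions, try to derive height and bitrate from the URL
        # itself (e.g. a '720P_4000K' fragment).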
        formats = []
        for video_url, height in video_urls:
            tbr = None
            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
            if mobj:
                if not height:
                    height = int(mobj.group('height'))
                tbr = int(mobj.group('tbr'))
            formats.append({
                'url': video_url,
                'format_id': '%dp' % height if height else None,
                'height': height,
                'tbr': tbr,
            })
        self._sort_formats(formats)

        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        page_params = self._parse_json(self._search_regex(
            r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
            webpage, 'page parameters', group='data', default='{}'),
            video_id, transform_source=js_to_json, fatal=False)

        tags = categories = None
        if page_params:
            tags = page_params.get('tags', '').split(',')
            categories = page_params.get('categories', '').split(',')

        return {
            'id': video_id,
            'uploader': video_uploader,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
            'tags': tags,
            'categories': categories,
            'subtitles': subtitles,
        }


class PornHubPlaylistBaseIE(InfoExtractor):
    def _extract_entries(self, webpage):
        # Only process the container div with the main playlist content,
        # skipping the drop-down menu that uses a similar markup pattern for
        # videos (see https://github.com/rg3/youtube-dl/issues/11594).
        container = self._search_regex(
            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
            'container', default=webpage)

        return [
            self.url_result(
                'http://www.pornhub.com/%s' % video_url,
                PornHubIE.ie_key(), video_title=title)
            for video_url, title in orderedSet(re.findall(
                r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
                container))
        ]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        entries = self._extract_entries(webpage)

        playlist = self._parse_json(
            self._search_regex(
                r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
                'playlist', default='{}'),
            playlist_id, fatal=False)
        title = playlist.get('title') or self._search_regex(
            r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)

        return self.playlist_result(
            entries, playlist_id, title, playlist.get('description'))


class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/4667351',
        'info_dict': {
            'id': '4667351',
            'title': 'Nataly Hot',
        },
        'playlist_mincount': 2,
    }, {
        'url': 'https://de.pornhub.com/playlist/4667351',
        'only_matching': True,
    }]


class PornHubUserVideosIE(PornHubPlaylistBaseIE):
    _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos'
    _TESTS = [{
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'info_dict': {
            'id': 'zoe_ph',
        },
        'playlist_mincount': 171,
    }, {
        'url': 'http://www.pornhub.com/users/rushandlia/videos',
        'only_matching': True,
    }, {
        # default sorting as Top Rated Videos
        'url': 'https://www.pornhub.com/channels/povd/videos',
        'info_dict': {
            'id': 'povd',
        },
        'playlist_mincount': 293,
    }, {
        # Top Rated Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
        'only_matching': True,
    }, {
        # Most Recent Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
        'only_matching': True,
    }, {
        # Most Viewed Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)

        entries = []
        for page_num in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    url, user_id, 'Downloading page %d' % page_num,
                    query={'page': page_num})
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
                    break
                raise
            page_entries = self._extract_entries(webpage)
            if not page_entries:
                break
            entries.extend(page_entries)

        return self.playlist_result(entries, user_id)
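

# A minimal usage sketch (illustration only, not part of the original module),
# assuming the youtube_dl package is installed and registers these extractors;
# they are normally driven through YoutubeDL rather than instantiated directly:
#
#     import youtube_dl
#
#     with youtube_dl.YoutubeDL({'quiet': True}) as ydl:
#         info = ydl.extract_info(
#             'http://www.pornhub.com/view_video.php?viewkey=648719015',
#             download=False)
#         print(info.get('title'), info.get('duration'))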