source: titan/mediathek/localhoster/lib/fx_gmu.py @ 41385

Last change on this file since 41385 was 40037, checked in by obi, 7 years ago

[tithek] hoster fix flashx

File size: 8.0 KB
Line 
1"""
flashx.tv urlresolver plugin
Copyright (C) 2015 tknorris

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
17"""
18import re
19import urlparse
20import urllib
21from lib import jsunpack
22from lib import unwise
23from lib import helpers
24#from urlresolver import common
25#from urlresolver.resolver import ResolverError
26from net import Net
27import common
28
# Rank of each quality label; get_media_url() sorts sources by this value in
# descending order, and labels missing from the table rank as 0.
SORT_KEY = {'High': 3, 'Middle': 2, 'Low': 1}
# Single Net instance used for every http_GET / http_HEAD call in this module.
net = Net()

# ETag values (compared after stripping double quotes) that check_headers()
# treats as marking an unusable file.
BAD_ETAGS = ["580eabc0-40ac3a", "582e7b99-9cd574b"]
# Smallest Content-Length, in bytes (5 * 1024 * 1024), check_headers() accepts.
MIN_CONTENT = 1024 * 1024 * 5
35
def get_media_url(url):
    """Resolve a flashx embed page URL to a stream link plus request headers.

    Steps, in order:
      1. Fetch the embed page and run the ad-block probe on it.
      2. Fetch every script referenced by the page, decode it with
         unwise.unwise_process() and replay the ad-block / XHR requests
         found in it.
      3. Follow every href that contains '-<media_id>.' and continue with
         the HTML of the last such page.
      4. Unpack the packed ``eval(function...`` scripts and collect the
         sources that helpers.parse_sources_list() finds in them.
      5. Drop duplicated and blacklisted links, order the rest by SORT_KEY
         and let helpers.pick_source() choose one.

    :param url: embed URL; must contain ``embed.php?c=<media id>``.
    :returns: the value of helpers.pick_source() concatenated with the value
        of helpers.append_headers(headers).
    :raises AttributeError: if ``url`` has no ``embed.php?c=`` component.
        Errors raised by the network layer or the helpers propagate
        unchanged.
    """
    hostname = urlparse.urlparse(url).hostname
    media_id = re.search(r'embed.php\?c=(.*)', url).group(1)
    headers = {'User-Agent': common.IE_USER_AGENT}
    html = net.http_GET(url, headers=headers).content
    adblock_check(html, headers)
    headers.update({'Referer': url})

    for js_url in get_js_url(html):
        js = get_js(js_url, headers, hostname)
        js = unwise.unwise_process(js)
        adblock_check(js, headers)
        xhr_check(js, headers)

    html = cleanse_html(html)
    # finditer keeps scanning the string it was given, so rebinding ``html``
    # inside the loop does not disturb the iteration.
    for match in re.finditer(r'''href=['"]([^'"]+)''', html):
        playvid_url = match.group(1)
        if '-%s.' % (media_id) in playvid_url:
            # A previous iteration may have pointed the Referer at a play
            # page; the request itself is always sent with the embed URL.
            headers.update({'Referer': url})
            html = net.http_GET(playvid_url, headers=headers).content
            html = cleanse_html(html)
            headers['Referer'] = playvid_url

    sources = []
    spans = get_span_ids(html, media_id)
    packed_script = r'<script[^>]*>\s*(eval\(function.*?)</script>'
    for match in re.finditer(packed_script, html, re.DOTALL):
        js_data = jsunpack.unpack(match.group(1))
        # When span ids were found, only scripts mentioning one of them count.
        if not spans or any(span_id in js_data for span_id in spans):
            sources += helpers.parse_sources_list(js_data)

    # Each source is indexed as source[0] (label) and source[1] (link).
    # Keep only links that occur exactly once.
    link_counts = {}
    for source in sources:
        link_counts[source[1]] = link_counts.get(source[1], 0) + 1
    sources = [source for source in sources if link_counts[source[1]] == 1]

    # Links containing any of these fragments are discarded
    # (presumably placeholder videos -- not verifiable from this file).
    rejected = ('/movie.mp4', '/trailer.mp4', '://cdn.flashx.tv')
    sources = [source for source in sources
               if not any(fragment in source[1].lower() for fragment in rejected)]

    try:
        sources.sort(key=lambda x: SORT_KEY.get(x[0], 0), reverse=True)
    except Exception:
        # Ordering is best effort; an unsortable list is used as it is.
        pass

    source = helpers.pick_source(sources)
    return source + helpers.append_headers(headers)
106
def cleanse_html(html):
    """Strip HTML comments and hidden div/span elements from ``html``.

    A comment is kept only when the two characters immediately before its
    closing ``-->`` are ``//``; every other comment is removed. After that,
    every ``<div>``/``<span>`` whose style attribute is exactly
    ``visibility: hidden`` or ``display: none`` (optional trailing ``;``) is
    removed together with its content, matching case-insensitively and
    across line breaks.

    :param html: page markup as a string.
    :returns: the markup with those parts removed.
    """
    for match in re.finditer(r'<!--.*?(..)-->', html, re.DOTALL):
        if match.group(1) != '//':
            html = html.replace(match.group(0), '')

    # The flags must go to compile(): the fourth positional argument of
    # re.sub() is ``count``, so passing them there silently drops the flags
    # and caps the number of replacements instead.
    hidden_element = re.compile(
        r'''<(div|span)[^>]+style=["'](visibility:\s*hidden|display:\s*none);?["']>.*?</\1>''',
        re.I | re.DOTALL)
    return hidden_element.sub('', html)
113
def get_span_ids(html, media_id):
    """Return the ids of spans that contain the thumbnail image of a media id.

    Every fragment returned by ``get_dom(html, 'span')`` is inspected. A
    span's id is collected when the text after its opening tag contains an
    ``<img>`` whose ``src`` ends in ``<media_id>`` followed by one character
    and ``jpg``/``jpeg``.

    :param html: page markup as a string.
    :param media_id: media identifier; it is matched literally.
    :returns: list of span id strings, in fragment order. get_dom() lower-cases
        its input, so the ids come back lower-cased.
    """
    # Escape the id so regex metacharacters in it cannot change or break
    # the pattern.
    thumbnail = r'''<img[^>]+src=['"][^"']+%s.jpe?g''' % re.escape(media_id)
    span_with_id = r'''<span[^>]+id=['"]([^'"]+)[^>]+>(.*)'''

    spans = []
    for span in get_dom(html, 'span'):
        match = re.search(span_with_id, span, re.I | re.DOTALL)
        if match and re.search(thumbnail, match.group(2), re.I | re.DOTALL):
            spans.append(match.group(1))

    return spans
124   
def get_dom(html, tag):
    """Collect the text of every ``<tag ...`` element found in ``html``.

    The markup is lower-cased before searching, so the returned fragments
    are lower-case too. Each fragment starts at an opening ``<tag`` and
    stops right before the ``</tag`` chosen as its end; when further
    openings of the same tag appear before that end, the end is moved on to
    a later ``</tag`` to account for nesting. An opening without any
    following ``</tag`` yields the rest of the text. Tags are recognised by
    prefix only (``<tag`` / ``</tag``).

    :param html: markup as a string.
    :param tag: tag name, in any case.
    :returns: list of fragment strings, one per opening tag, in document
        order (nested elements are reported again on their own).
    """
    opener = '<%s' % (tag.lower())
    closer = '</%s' % (tag.lower())
    opener_len = len(opener)
    closer_len = len(closer)

    fragments = []
    remaining = html.lower()
    while remaining:
        begin = remaining.find(opener)
        if begin == -1:
            # Without an opening tag nothing more can be collected.
            break

        finish = remaining.find(closer, begin)
        nested = remaining.find(opener, begin + 1)
        # Every further opening located before the current end pushes the
        # end on to the next closing tag, if there is one.
        while nested != -1 and nested < finish:
            later = remaining.find(closer, finish + closer_len)
            if later != -1:
                finish = later
            nested = remaining.find(opener, nested + 1)

        if finish > -1:
            fragments.append(remaining[begin:finish])
        else:
            fragments.append(remaining[begin:])

        # Resume just past this opening so nested tags are visited as well.
        remaining = remaining[begin + opener_len:]

    return fragments
155   
def adblock_check(js, headers):
    """Replay the ``get(...)`` request that follows a ``!= null`` test in ``js``.

    When the pattern is found, the captured path is joined onto
    ``https://www.flashx.tv``, the captured parameter name/value pair is
    appended as a query string, the result is normalised with
    resolve_url() and fetched with a GET. The response body is read and
    discarded. Nothing happens when the pattern is absent.

    :param js: script or page text to scan.
    :param headers: headers dict sent with the request.
    """
    found = re.search('''!=\s*null.*?get\(['"]([^'"]+).*?\{([^:]+)\s*:\s*['"]([^'"]+)''', js, re.DOTALL)
    if not found:
        return

    path, param_name, param_value = found.groups()
    absolute = urlparse.urljoin('https://www.flashx.tv', path)
    query = urllib.urlencode({param_name: param_value})
    target = resolve_url(absolute + '?' + query)
    # Only the request matters; the body is read but not used.
    net.http_GET(target, headers=headers).content
163
def xhr_check(js, headers):
    """Fetch the URL passed to the first ``request.open(...)`` call in ``js``.

    The method argument of the call is captured but not used: the URL is
    always requested with a GET, and the response body is read and
    discarded. Nothing happens when no such call is found.

    :param js: script text to scan.
    :param headers: headers dict sent with the request.
    """
    found = re.search('''request\.open\(\s*["']([^'"]+)"\s*,\s*['"]([^'"]+)''', js, re.DOTALL)
    if found is None:
        return

    target = found.group(2)
    # Only the request matters; the body is read but not used.
    net.http_GET(target, headers=headers).content
170   
def check_headers(source, headers):
    """Tell whether a source link looks usable, judged by a HEAD request.

    The link is ``source[1]``. It is rejected when the response ETag (with
    double quotes stripped) is listed in BAD_ETAGS, or when its
    Content-Length is below MIN_CONTENT. Any exception raised while
    checking counts as acceptance.

    :param source: indexable whose second item is the link.
    :param headers: headers dict sent with the request.
    :returns: True to keep the source, False to reject it.
    """
    try:
        reply = net.http_HEAD(source[1], headers=headers)
        reply_headers = reply.get_headers(as_dict=True)
        etag = reply_headers.get('Etag', '').strip('"')
        if etag in BAD_ETAGS:
            return False
        size = int(reply_headers.get('Content-Length', 0))
        return size >= MIN_CONTENT
    except Exception:
        # A failed check must not eliminate the source.
        return True
182
def get_js_url(html):
    """List the script URLs referenced by ``<script src=...>`` tags in ``html``.

    Quote characters are removed from each ``src`` value. A value without
    any ``/`` is treated as a possible JavaScript variable name: surrounding
    ``+`` characters are stripped and, if ``html`` contains
    ``var <name> = "<value>"``, that value is used instead.

    :param html: page markup as a string.
    :returns: list of URL strings in document order, one per matching tag.
    """
    urls = []
    for script in re.finditer(r'''<script[^>]*src\s*=\s*(["'])(.*?)\1''', html, re.I):
        js_url = re.sub(r'''['"]''', '', script.group(2).strip())
        if '/' not in js_url:
            js_url = js_url.strip('+')
            # The name is matched literally; escaping keeps stray regex
            # metacharacters from raising re.error.
            pattern = r'''var\s+%s\s*=\s*(['"])(.*?)\1''' % re.escape(js_url)
            assignment = re.search(pattern, html)
            if assignment:
                js_url = assignment.group(2)
        urls.append(js_url)
    return urls
196   
def get_js(js_url, headers, hostname):
    """Download a script and return its text, or '' when the request fails.

    Scheme-relative URLs (``//host/...``) get ``https:`` prepended. URLs
    that do not start with ``http`` are resolved against
    ``https://<hostname>``.

    :param js_url: script URL as found in the page.
    :param headers: headers dict sent with the request.
    :param hostname: host used as the base for relative URLs.
    :returns: the response content, or an empty string on any request error.
    """
    if js_url.startswith('//'):
        js_url = 'https:' + js_url
    elif not js_url.startswith('http'):
        js_url = urlparse.urljoin('https://' + hostname, js_url)

    try:
        return net.http_GET(js_url, headers=headers).content
    except Exception:
        # Best effort: a script that cannot be fetched is treated as empty.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return ''
209   
def resolve_url(url):
    """Remove ``.`` and ``..`` segments from the path of ``url``.

    A ``..`` segment drops the preceding segment, except that the first
    collected segment (the leading ``/`` of an absolute path) is never
    dropped. ``.`` segments are discarded. Scheme, host, query and fragment
    are left untouched.

    :param url: URL string.
    :returns: the URL with its path normalised.
    """
    components = list(urlparse.urlsplit(url))
    pieces = components[2].split('/')
    last_index = len(pieces) - 1

    kept = []
    for index, piece in enumerate(pieces):
        # Every piece but the last keeps its trailing slash.
        token = piece if index == last_index else piece + '/'
        if token in ('../', '..'):
            if len(kept) > 1:
                kept.pop()
            continue
        if token in ('./', '.'):
            continue
        kept.append(token)

    components[2] = ''.join(kept)
    return urlparse.urlunsplit(components)
Note: See TracBrowser for help on using the repository browser.