mirror of
https://github.com/ChronosX88/medved.git
synced 2024-11-22 22:52:19 +00:00
61 lines
2.0 KiB
Python
61 lines
2.0 KiB
Python
from lib.exec import Task
|
|
|
|
from io import BytesIO
|
|
import json
|
|
import time
|
|
|
|
import bs4
|
|
import requests
|
|
import urllib3
|
|
from PIL import Image
|
|
from bson.binary import Binary
|
|
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
from selenium.webdriver.common.proxy import Proxy, ProxyType
|
|
import zlib
|
|
import netaddr
|
|
|
|
|
|
class HTTPFindTask(Task):
    """Fetch the root page of a host over plain HTTP and record the result.

    The response details and the page title are written into
    ``self._host['data']``.
    """

    def __init__(self, id, root):
        super().__init__(id, root)

    def _process(self, item):
        """Request ``http://<ip>:<port>/`` and store the response and its title.

        Writes ``self._host['data']['response']`` (keys ``code``, ``text``,
        ``content``, ``encoding``, ``headers``) and ``self._host['data']['title']``.

        Raises:
            self.PipelineError: if the status code is 400, 401, 403 or 500.
        """
        # NOTE(review): `item` is not read here; everything goes through
        # `self._host`, which is not assigned in this class — presumably the
        # Task base class provides it; confirm.
        urllib3.disable_warnings()

        host = self._host
        # NOTE(review): `cnf` is not imported in the visible part of this
        # file — confirm it is made available elsewhere.
        response = requests.get(
            url='http://%s:%s/' % (host['ip'], host['port']),
            timeout=cnf.stalker.HTTP.timeout,
            verify=False,
        )

        if response.status_code in (400, 401, 403, 500):
            raise self.PipelineError("Bad response")

        data = host['data']
        data['response'] = {
            'code': response.status_code,
            'text': response.text,
            'content': response.content,
            'encoding': response.encoding,
            'headers': response.headers,
        }

        # Only pass the response encoding to the parser when the Content-Type
        # header carries an explicit charset; otherwise let the parser detect it.
        content_type = response.headers.get('content-type', '').lower()
        declared_encoding = response.encoding if 'charset' in content_type else None
        soup = bs4.BeautifulSoup(
            response.content, "html.parser", from_encoding=declared_encoding
        )

        if soup.original_encoding != 'utf-8':
            meta = soup.select_one('meta[charset], meta[http-equiv="Content-Type"]')
            if meta:
                # Point the document's own charset declaration at UTF-8.
                attr, value = (
                    ('charset', 'utf-8')
                    if 'charset' in meta.attrs
                    else ('content', 'text/html; charset=utf-8')
                )
                meta[attr] = value
            # NOTE(review): the original indentation was lost; this store is
            # assumed to belong to the non-UTF-8 branch (sibling of `if meta`)
            # — confirm against the upstream file.
            data['response']['text'] = soup.prettify()

        # Fall back to an empty title when the tag is missing or has no
        # single string child.
        title_tag = soup.select_one('title')
        page_title = ""
        if title_tag and title_tag.string:
            page_title = title_tag.string

        data['title'] = page_title
|