1. Description
While crawling a certain website I ran into encrypted request parameters. Because the site's JavaScript is obfuscated and compiled, it is hard to reverse-engineer, so I use Selenium to obtain the parameters instead. Selenium normally exposes only data from the rendered page, but the asynchronous requests initiated by the website can be extracted from the browser logs.
2. Setting driver parameters
First we have to enable browser log monitoring through an options object (ChromeOptions, for example). In older versions of Selenium this was configured through DesiredCapabilities; here is how it is written in newer versions:
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
# Build the Chrome options: launch switches first, then the logging setup.
options = ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--disable-single-click-autofill")
# Plain switch name: a trailing "[8]" footnote marker is not part of the switch.
options.add_argument("--disable-autofill-keyboard-accessory-view")
options.add_argument("--disable-full-form-autofill-ios")
# Performance-log preferences passed inside the Chrome options: network
# events on, page events off.
options.add_experimental_option('perfLoggingPrefs', {
    'enableNetwork': True,
    'enablePage': False,
})
# Request every level of the "browser" and "performance" logs so that they
# can be read back later with driver.get_log().
options.set_capability("goog:loggingPrefs", {
    'browser': 'ALL',
    'performance': 'ALL',
})
# NOTE(review): the same preferences are also sent as a separate top-level
# capability; confirm whether ChromeDriver actually reads this key.
options.set_capability("goog:perfLoggingPrefs", {
    'enableNetwork': True,
    'enablePage': False,
    'enableTimeline': False
})
3. Requesting a Web page
Now instantiate a driver and request the web page. Here I use `WebDriverWait` to explicitly wait for an element to appear; you can also use an implicit wait or simply sleep.
If you do not wait, the logs may be read before the asynchronous requests have been made, and you may not get the data you want.
# Start Chrome through the chromedriver binary, using the options built above.
service = Service(executable_path=executable_path)
driver = Chrome(service=service, options=options)

# Script registered for every new document: makes navigator.webdriver read as
# undefined.
hide_webdriver_script = """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": hide_webdriver_script},
)

driver.get(page_url)

# Wait up to 15 s (polling every 0.5 s) for the target element to be present.
wait = WebDriverWait(driver, 15, 0.5)
item_locator = (By.CLASS_NAME, "item ")
try:
    wait.until(expected_conditions.presence_of_element_located(item_locator))
except Exception as exc:
    # A failed wait is only reported; the script carries on regardless.
    print(f"WebDriverWait.until timeout error: {exc}")

# Snapshot of the current DOM as an HTML string.
html = driver.execute_script("return document.documentElement.outerHTML")
4. Processing logs
Read the driver's `log_types` attribute to get all the available log types, iterate over it, pass each type to the `get_log()` method to get the corresponding logs, and then filter out the entries you want.
For example, here I keep only the `Network.requestWillBeSent` entries, i.e. the events recorded when an asynchronous request is sent, because I need the request headers of that request; a response-type entry (`Network.responseReceived`) contains only the response headers. The specific event types supported can be found in the Google DevTools documentation.
If you need to keep only Ajax (XHR) requests, you can filter on the `type` field in the log entry's `params`, or you can identify them by some other property of the request (for example its headers or URL).
sign_dict = {}  # Use it to store the data you want
for log_kind in driver.log_types:
    for entry in driver.get_log(log_kind):
        # Each entry's 'message' is expected to be a JSON string wrapping the
        # actual event under its own 'message' key; anything else is printed
        # and skipped.
        try:
            event = json.loads(entry['message'])['message']
        except Exception as exc:
            print(exc)
            continue

        # Keep only outgoing requests ...
        if event.get('method') != 'Network.requestWillBeSent':
            continue
        # ... of type XHR.
        if event.get("params", {}).get("type", "").upper() != "XHR":
            continue

        request = event['params'].get('request', {})
        headers = request.get('headers')
        # Only requests that actually carry the X-Sign header are of interest.
        x_sign = headers.get('X-Sign') if headers else None
        if not x_sign:
            continue

        x_app_id = headers.get('X-AppID')
        x_ts = headers.get('X-Ts')
        print("success:", x_sign, x_app_id, x_ts)

        # Key the result by the last path segment of the URL, query removed.
        req_url = request.get('url')
        key = os.path.basename(req_url.partition("?")[0])
        sign_dict[key] = {"X-AppID": x_app_id, "X-Sign": x_sign, "X-Ts": x_ts}
Note that if you want the response body, the `response` field of a `Network.responseReceived` log entry does not contain it; you need to fetch it using the `requestId` found in the `params` field. The reference code is as follows:
res_body_dict = {}  # parsed response bodies, keyed by the URL's last path segment
for log_kind in driver.log_types:
    for entry in driver.get_log(log_kind):
        # Entries that are not the expected JSON envelope are printed and skipped.
        try:
            event = json.loads(entry['message'])['message']
        except Exception as exc:
            print(exc)
            continue

        # Keep only received responses of type XHR.
        if event.get('method') != 'Network.responseReceived':
            continue
        if event.get("params", {}).get("type", "").upper() != "XHR":
            continue

        params = event['params']
        request_id = params.get("requestId")
        if not request_id:
            continue

        req_url = params.get('response', {}).get('url')
        key = os.path.basename(req_url.partition("?")[0])

        # The log entry does not include the body; ask the browser for it by request id.
        content = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        try:
            body = json.loads(content["body"])
        except Exception as exc:
            print(f"get_unisat_data_by_selenium() json loads error: {exc}, content:{content}")
            body = None  # stored as None when the body is not valid JSON
        res_body_dict[key] = body
5. Complete Code
The complete reference code for the steps above is as follows:
import json
import os.path
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
def get_selenium_driver(executable_path=r"E:\webdriver\chromedriver.exe"):
    """Start a Chrome driver with browser and performance logging requested.

    Args:
        executable_path: Path to the chromedriver binary handed to ``Service``.

    Returns:
        The started ``Chrome`` driver. The caller owns it and is responsible
        for calling ``quit()`` when done.
    """
    options = ChromeOptions()
    launch_switches = (
        "--no-sandbox",
        "--allow-running-insecure-content",
        "--ignore-certificate-errors",
        "--disable-single-click-autofill",
        # Plain switch name: a trailing "[8]" footnote marker is not part of
        # the switch.
        "--disable-autofill-keyboard-accessory-view",
        "--disable-full-form-autofill-ios",
    )
    for switch in launch_switches:
        options.add_argument(switch)

    # Performance-log preferences: network events on, page events off.
    options.add_experimental_option('perfLoggingPrefs', {
        'enableNetwork': True,
        'enablePage': False,
    })
    # Request every level of the "browser" and "performance" logs so they can
    # be read back with driver.get_log().
    options.set_capability("goog:loggingPrefs", {
        'browser': 'ALL',
        'performance': 'ALL',
    })
    # NOTE(review): the same preferences are also sent as a separate
    # top-level capability; confirm whether ChromeDriver reads this key.
    options.set_capability("goog:perfLoggingPrefs", {
        'enableNetwork': True,
        'enablePage': False,
        'enableTimeline': False
    })

    service = Service(executable_path=executable_path)
    driver = Chrome(service=service, options=options)
    try:
        # Registered for every new document: makes navigator.webdriver read
        # as undefined.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""},
        )
    except Exception:
        # Do not leave a browser process behind if the setup call fails.
        driver.quit()
        raise
    return driver
def get_sign_by_selenium(page_url):
    """Load ``page_url`` and collect the signing headers of its XHR requests.

    The driver logs are scanned for ``Network.requestWillBeSent`` events of
    type XHR whose request headers contain ``X-Sign``.

    Args:
        page_url: URL of the page to open.

    Returns:
        A dict keyed by the last path segment of each matching request URL
        (query string removed). Each value is a dict with the ``X-AppID``,
        ``X-Sign`` and ``X-Ts`` header values. A later request with the same
        key overwrites an earlier one.
    """
    # get_selenium_driver() already registers the navigator.webdriver
    # override, so it is not registered a second time here.
    driver = get_selenium_driver()
    try:
        driver.get(page_url)
        wait = WebDriverWait(driver, 15, 0.5)
        try:
            wait.until(expected_conditions.presence_of_element_located((By.CLASS_NAME, "item ")))
        except Exception as e:
            # Best effort: a failed wait is reported and the logs are still read.
            print("WebDriverWait.until timeout error: {}".format(e))

        sign_dict = {}
        for log_type in driver.log_types:
            for row_log in driver.get_log(log_type):
                # Only entries whose message is a JSON envelope around the
                # event can be used; others are reported and skipped.
                try:
                    message_log = json.loads(row_log['message'])['message']
                except (KeyError, TypeError, ValueError) as e:
                    print(e)
                    continue
                if message_log.get('method') != 'Network.requestWillBeSent':
                    continue
                params = message_log.get("params", {})
                if params.get("type", "").upper() != "XHR":
                    continue
                request = params.get('request', {})
                headers = request.get('headers')
                if not headers:
                    continue
                x_sign = headers.get('X-Sign')
                if not x_sign:
                    continue
                req_url = request.get('url')
                if not req_url:
                    # Without a URL there is nothing to key the entry by.
                    continue
                x_app_id = headers.get('X-AppID')
                x_ts = headers.get('X-Ts')
                print("success:", x_sign, x_app_id, x_ts)
                key = os.path.split(req_url.split("?")[0])[1]
                sign_dict[key] = {"X-AppID": x_app_id, "X-Sign": x_sign, "X-Ts": x_ts}
        return sign_dict
    finally:
        # Always shut the browser down, even when something above raised.
        driver.quit()
def get_unisat_data_by_selenium(page_url):
    """Load ``page_url`` and collect the JSON bodies of its XHR responses.

    The driver logs are scanned for ``Network.responseReceived`` events of
    type XHR; each body is then requested from the browser by request id.

    Args:
        page_url: URL of the page to open.

    Returns:
        A dict keyed by the last path segment of each response URL (query
        string removed). Each value is the parsed JSON body, or ``None`` when
        the body could not be parsed as JSON. Responses whose body could not
        be fetched at all are left out.
    """
    # get_selenium_driver() already registers the navigator.webdriver
    # override, so it is not registered a second time here.
    driver = get_selenium_driver()
    try:
        driver.get(page_url)
        wait = WebDriverWait(driver, 15, 0.5)
        try:
            wait.until(expected_conditions.presence_of_element_located((By.CLASS_NAME, "item ")))
        except Exception as e:
            # Best effort: a failed wait is reported and the logs are still read.
            print("WebDriverWait.until timeout error: {}".format(e))

        res_body_dict = {}
        for log_type in driver.log_types:
            for row_log in driver.get_log(log_type):
                # Only entries whose message is a JSON envelope around the
                # event can be used; others are reported and skipped.
                try:
                    message_log = json.loads(row_log['message'])['message']
                except (KeyError, TypeError, ValueError) as e:
                    print(e)
                    continue
                if message_log.get('method') != 'Network.responseReceived':
                    continue
                params = message_log.get("params", {})
                if params.get("type", "").upper() != "XHR":
                    continue
                request_id = params.get("requestId")
                if not request_id:
                    continue
                req_url = params.get('response', {}).get('url')
                if not req_url:
                    # Without a URL there is nothing to key the entry by.
                    continue
                key = os.path.split(req_url.split("?")[0])[1]
                try:
                    content = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
                except Exception as e:
                    # One body the browser cannot supply must not abort the
                    # whole collection; report it and move on.
                    print("get_unisat_data_by_selenium() getResponseBody error: {}, requestId:{}".format(e, request_id))
                    continue
                body = None
                try:
                    body = json.loads(content["body"])
                except (KeyError, TypeError, ValueError) as e:
                    print("get_unisat_data_by_selenium() json loads error: {}, content:{}".format(e, content))
                res_body_dict[key] = body
        return res_body_dict
    finally:
        # Always shut the browser down, even when something above raised.
        driver.quit()
if __name__ == '__main__':
    # Demo run against a sample page; prints the collected signing headers.
    url = "https://unisat.io/brc20?q=bc1pkmnh3nj89uns3yp2mtqqxjns65vy6ca6n5jvp4s8ua8nke69cnjs987vtp"
    signs = get_sign_by_selenium(url)
    print("get_sign_by_selenium(url):", signs)
    # print("get_unisat_data_by_selenium(url):", get_unisat_data_by_selenium(url))
P.S. For more information on using Selenium, you can refer to my previous articles:
【Testing】Selenium’s use (common attribute methods, element waiting, manipulating cookies, manipulating elements, headless mode, getting HTML source code)
[Testing] Selenium anti-crawling operations
Modify selenium option configuration parameters to optimize performance.
Using selenium on Linux (CentOS, Ubuntu) interface-less servers.
[Testing] Selenium Cookie Manipulation