1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
|
"""
@File    :   cdn_data_dns.py
@Time    :   2023/08/21 21:46:47
@Author  :   Li Ruilong
@Version :   1.0
@Contact :   liruilonger@gmail.com
@Desc    :   Collect CDN node IP addresses by province and city
"""
from seleniumwire import webdriver import json import time from selenium.webdriver.common.by import By import pandas as pd import re
# Loose IPv4 matcher: four dot-separated groups of 1-3 digits. Octets are not
# range-checked, so strings such as 999.1.1.1 also match.
ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"

# Load the saved cookies BEFORE launching Chrome so that a missing or malformed
# cookie file does not leave an orphaned browser process behind.
# The path is a raw string: the previous literal mixed escaped and unescaped
# backslashes ("\山", "\G"), which are invalid escape sequences and trigger a
# SyntaxWarning on Python 3.12+. The runtime value is unchanged.
# 'u8' is Python's built-in alias for UTF-8.
with open(r'C:\Users\山河已无恙\Documents\GitHub\reptile_demo\demo\cookie.txt', 'r', encoding='u8') as f:
    cookies = json.load(f)

driver = webdriver.Chrome()

# WebDriver only accepts cookies for the domain that is currently loaded, so
# the site is opened once, the cookies are injected, and the page is loaded
# again so the request is sent with the cookies attached.
driver.get('https://cdn.chinaz.com/')
for cookie in cookies:
    driver.add_cookie(cookie)
driver.get('https://cdn.chinaz.com/')

# Fixed pause to let the page finish rendering before elements are queried.
time.sleep(6)

CDN_Manufacturer = []

# Drill down to the vendor list container and collect the vendor links.
new_div_element = driver.find_element(By.CSS_SELECTOR, ".toplist-main")
div_elements = new_div_element.find_element(By.CSS_SELECTOR, ".ullist")
# NOTE(review): an XPath starting with "//" is evaluated against the whole
# document even when called on an element, so this returns every matching
# link on the page, not only those inside ".ullist". Use ".//a[...]" if the
# search is meant to be scoped to the container - confirm against the page.
div_cdn = div_elements.find_elements(By.XPATH, "//a[contains(@href,'server')]")
# Provinces whose node lists are collected for every CDN vendor.
areas_list = ["安徽", "河北", "河南", "湖北", "湖南", "江西", "陕西", "山西", "四川", "重庆"]

# Pager links on the vendor page ("尾页" = last page, "下一页" = next page).
_LAST_PAGE_XPATH = "//a[contains(@title, '尾页')]"
_NEXT_PAGE_XPATH = "//a[contains(@title, '下一页')]"


def _visible_ips(browser):
    """Return all strings matching ``ip_pattern`` in the first ``.box`` element's text."""
    box = browser.find_element(By.CSS_SELECTOR, ".box")
    return re.findall(ip_pattern, str(box.text))


def _save_ips(vendor, province, ips):
    """Write ``ips`` as a one-column CSV (column name = vendor) and return the file name."""
    csv_name = 'CDN_M_省份_' + province + '_' + vendor + '.csv'
    pd.DataFrame({vendor: ips}).to_csv(csv_name, index=False)
    return csv_name


def _scrape_province(browser, vendor, province):
    """Select ``province`` on the open vendor page, walk its pages and save the IPs.

    The IP list is local to this call, so each province file contains only the
    addresses listed for that province. Previously one list was shared across
    all provinces of a vendor, so every later file also carried the earlier
    provinces' addresses.
    """
    areas = browser.find_element(By.CSS_SELECTOR, "#areas")
    # NOTE(review): the leading "//" makes this XPath document-wide rather than
    # relative to "#areas"; kept as-is to preserve which element is clicked.
    areas.find_element(By.XPATH, "//a/font[contains(text(),'" + province + "')]").click()
    time.sleep(2)

    ips = _visible_ips(browser)

    last_page_links = browser.find_elements(By.XPATH, _LAST_PAGE_XPATH)
    # Fewer than two "last page" links is treated as a single-page result.
    # NOTE(review): presumably a multi-page listing renders the pager twice;
    # confirm against the live page.
    if len(last_page_links) < 2:
        csv_name = _save_ips(vendor, province, ips)
        print("单页数据,数据已保存为CSV文件", csv_name)
        return

    # The first "last page" link carries the total page count in its "val" attribute.
    attribute_value = last_page_links[0].get_attribute('val')
    print(attribute_value)

    # Page 1 is already captured above; click "next" for each remaining page.
    for _ in range(1, int(attribute_value)):
        try:
            browser.find_element(By.XPATH, _NEXT_PAGE_XPATH).click()
            time.sleep(5)
            ips.extend(_visible_ips(browser))
        except Exception as exc:
            # Best effort: a failed page is reported and skipped, not fatal.
            print(province, vendor, "没有IP", exc)
            time.sleep(5)

    csv_name = _save_ips(vendor, province, ips)
    print("数据已保存为CSV文件", csv_name)


def _return_to_main_window(browser, main_handle):
    """Close every window except ``main_handle`` and give it focus again.

    Only extra windows are closed, so a failure that happens before a vendor
    tab was opened can no longer close the main listing window.
    """
    for handle in browser.window_handles:
        if handle != main_handle:
            browser.switch_to.window(handle)
            browser.close()
    browser.switch_to.window(main_handle)


# Handle of the listing window; every vendor opens in an additional window.
current_window_1 = driver.current_window_handle

try:
    for mdn_ms in div_cdn:
        # Bound up front so the error message below never hits an unbound
        # (or stale, previous-iteration) vendor name.
        cloud_cdn_name = "<unknown>"
        try:
            cloud_cdn_name = mdn_ms.text
            print(cloud_cdn_name)
            mdn_ms.click()
            time.sleep(2)
            # The vendor page is expected to be the most recently opened window.
            driver.switch_to.window(driver.window_handles[-1])
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2)")
            time.sleep(5)
            for a in areas_list:
                _scrape_province(driver, cloud_cdn_name, a)
        except Exception as exc:
            # Best effort per vendor: report and move on to the next one.
            # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit stop the run.
            print(cloud_cdn_name, "没有IP", exc)
        finally:
            _return_to_main_window(driver, current_window_1)
finally:
    # Always shut the browser and its driver process down, even on interruption.
    driver.quit()
|