tsd的文章

  • python爬取数据时遭遇动态滚动条

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    import time

    # Scroll a lazily-loading page to the bottom until it stops growing, then
    # print the fully loaded page source.

    # Create the Chrome driver. Headless mode is available but disabled.
    chrome_options = Options()
    # chrome_options.add_argument('--headless')  # enable to run without a window
    chrome_options.add_argument("--start-maximized")
    # NOTE(review): positional driver path + `chrome_options=` is the older
    # Selenium calling convention - confirm against the installed version.
    driver = webdriver.Chrome("D://googleDever//chromedriver.exe", chrome_options=chrome_options)

    try:
        # Load the page and give it time to render.
        driver.get("https://wenku.baidu.com/search?lm=0&od=0&ie=utf-8&word=csdn")
        time.sleep(3)

        # Read the initial page height. The script returns the value directly
        # instead of assigning it to an implicit global first.
        js = "return document.body.scrollHeight"
        height = driver.execute_script(js)

        # Move the scroll bar to the bottom of the page.
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(5)

        # Start of the current waiting window (seconds since the epoch).
        t1 = int(time.time())
        # Loop flag; set to False to end the while loop.
        status = True
        # Number of retries used so far.
        num = 0

        while status:
            # Current timestamp (seconds).
            t2 = int(time.time())
            if t2 - t1 < 30:
                # Still inside the 30 second window: check whether the page grew.
                new_height = driver.execute_script(js)
                if new_height > height:
                    time.sleep(1)
                    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    # Remember the new height and restart the window.
                    height = new_height
                    t1 = int(time.time())
                else:
                    # Pause between polls so the loop does not spin the CPU and
                    # flood the browser with script calls.
                    time.sleep(0.5)
            elif num < 3:
                # The height did not change for 30 seconds: retry up to 3 times.
                # Restarting the window is what lets the height check above run
                # again; without it the retries would only be idle waits.
                time.sleep(3)
                num = num + 1
                driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                t1 = int(time.time())
            else:
                # Timed out and out of retries: treat the page as fully loaded.
                print("滚动条已经处于页面最下方!")
                status = False
                # Move the scroll bar back to the top of the page.
                driver.execute_script('window.scrollTo(0, 0)')

        # Print the page source.
        content = driver.page_source
        print(content)
    finally:
        # Always shut the browser and driver process down, even on errors.
        driver.quit()
    0  留言 2021-06-10 10:05:31
  • python爬取双色球数据更新及数据库使用


    解析网站并获取数据
    # URL of the page that holds the lottery data.
    url = 'http://datachart.500.com/ssq/'
    # Fetch the page. The timeout keeps a stalled server from hanging the
    # script, and raise_for_status() stops an error page from being parsed.
    response = requests.get(url, headers={"User-Agent": UserAgent().chrome}, timeout=10)
    response.raise_for_status()
    # Parse the HTML and pull the data out with XPath.
    e = etree.HTML(response.text)
    # Text of the first cell of every row in the data table body.
    # NOTE(review): this selects from all rows while `trs` below skips rows
    # that carry a class attribute - confirm the two lists stay aligned.
    date_times = e.xpath('//tbody[@id="tdata"]/tr/td[1]/text()')
    # Row elements without a class attribute.
    trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]')
    链接数据库
    # Connect to the database.
    # NOTE(review): host and credentials are hard-coded; move them to
    # configuration before using this outside a local demo.
    # `database=` replaces the deprecated `db=` alias.
    client = pymysql.connect(host='localhost', port=3306, user='root', password='123456', charset='utf8', database='bangumi')
    cursor = client.cursor()
    普通获取数据
    # SQL used to insert one draw (parameterized; values are bound by the driver).
    sql = 'insert into doubleballs values(0,%s,%s,%s)'
    for data_time, tr in zip(date_times, trs):
        # Red balls: joined with '-' into a single string.
        red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()'))
        # Blue ball: first matching cell of the row.
        blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0]
        print(f"第{data_time}红球是:{red_ball} 蓝球:{blue_ball}")
        cursor.execute(sql, [data_time, red_ball, blue_ball])
    # Commit once after all rows are queued, so the import is one transaction.
    client.commit()
    更新数据:这部分先把抓取到的期号列表反转为从新到旧,再逐期到数据库中查询;index 作为新数据的计数器——循环遍历时,假如某一期在库中不存在(即有新数据要更新,result 返回值为 0),就让 index+1;一旦查到已存在的期号就停止遍历。
    # SQL used to check whether a draw is already stored.
    select_new_sql = "select * from doubleballs where date_time = %s"
    # Reverse in place; the code treats the reversed list as newest-first.
    date_times.reverse()
    # Number of draws that are not in the table yet.
    index = 0
    for data_time in date_times:
        result = cursor.execute(select_new_sql, [data_time])
        # Stop at the first draw that is already stored. `> 0` rather than
        # `== 1` so a draw that was stored more than once still counts.
        if result > 0:
            break
        index += 1
    将行数据的顺序同样反转为从新到旧,这样按照前面的 xpath 写法取到的第 1 条数据就是最新的一期;再根据上一步得到的 index,把前 index 条(即新增的数据)依次解析并放到数据库中。
    # Reverse the rows in place so they are ordered newest to oldest.
    trs.reverse()
    # Only the first `index` entries are inserted.
    for data_time, tr in zip(date_times[:index], trs[:index]):
        # Red balls: joined with '-' into a single string.
        red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()'))
        # Blue ball: first matching cell of the row.
        blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0]
        print(f"第{data_time}红球是:{red_ball} 蓝球:{blue_ball}")
        cursor.execute(sql, [data_time, red_ball, blue_ball])
    # Commit once after all new rows are queued.
    client.commit()
    完整代码
    import requests
    from fake_useragent import UserAgent
    from lxml import html
    import pymysql

    # Scrape the lottery chart page and insert the draws that are not in the
    # `doubleballs` table yet, newest first.

    etree = html.etree

    # URL of the page that holds the lottery data.
    url = 'http://datachart.500.com/ssq/'
    # Fetch the page. The timeout keeps a stalled server from hanging the
    # script, and raise_for_status() stops an error page from being parsed.
    response = requests.get(url, headers={"User-Agent": UserAgent().chrome}, timeout=10)
    response.raise_for_status()
    # Parse the HTML and pull the data out with XPath.
    e = etree.HTML(response.text)
    date_times = e.xpath('//tbody[@id="tdata"]/tr/td[1]/text()')
    trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]')

    # Connect to the database.
    # NOTE(review): host and credentials are hard-coded; move them to
    # configuration before using this outside a local demo.
    client = pymysql.connect(host='localhost', port=3306, user='root', password='123456', charset='utf8', database='bangumi')
    cursor = client.cursor()
    try:
        # SQL used to insert one draw.
        sql = 'insert into doubleballs values(0,%s,%s,%s)'
        # SQL used to check whether a draw is already stored.
        select_new_sql = "select * from doubleballs where date_time = %s"

        # Reverse in place; the code treats the reversed list as newest-first.
        date_times.reverse()
        # Number of draws that are not in the table yet.
        index = 0
        for data_time in date_times:
            result = cursor.execute(select_new_sql, [data_time])
            # Stop at the first draw that is already stored. `> 0` rather than
            # `== 1` so a draw that was stored more than once still counts.
            if result > 0:
                break
            index += 1

        # Reverse the rows too so both lists are ordered newest to oldest.
        trs.reverse()
        for data_time, tr in zip(date_times[:index], trs[:index]):
            # Red balls: joined with '-' into a single string.
            red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()'))
            # Blue ball: first matching cell of the row.
            blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0]
            print(f"第{data_time}红球是:{red_ball} 蓝球:{blue_ball}")
            cursor.execute(sql, [data_time, red_ball, blue_ball])
        # Commit once after all new rows are queued.
        client.commit()
    finally:
        # Release the cursor and the connection even if a step above failed.
        cursor.close()
        client.close()
    按上面的写法,一开始导入时数据是从新到旧排序的,本身不会出现问题;但之后每次更新,最新的数据都会排在数据的最后,前后顺序就不一致了。因为有点强迫症,最后还是统一改成从旧到新排序。只需要改动几行代码即可:把反转数据的操作放在提取红球、蓝球数据之前,保证取到的是最新的数据;判断时每查到一条已存在的数据就让 index+1;循环结束后加入 index = count - index(其中 count 为 date_times 的长度),得到新增数据的条数;接着将循环中的下标 i 变成 index-i-1。最终代码如下:
    import requests
    from fake_useragent import UserAgent
    from lxml import html
    import pymysql

    # Scrape the lottery chart page and insert the draws that are not in the
    # `doubleballs` table yet, oldest first, so new rows extend the table in
    # chronological order.

    etree = html.etree

    # URL of the page that holds the lottery data.
    url = 'http://datachart.500.com/ssq/'
    # Fetch the page. The timeout keeps a stalled server from hanging the
    # script, and raise_for_status() stops an error page from being parsed.
    response = requests.get(url, headers={"User-Agent": UserAgent().chrome}, timeout=10)
    response.raise_for_status()
    # Parse the HTML and pull the data out with XPath.
    e = etree.HTML(response.text)
    date_times = e.xpath('//tbody[@id="tdata"]/tr/td[1]/text()')
    trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]')
    # Total number of scraped draws.
    count = len(date_times)

    # Connect to the database.
    # NOTE(review): host and credentials are hard-coded; move them to
    # configuration before using this outside a local demo.
    client = pymysql.connect(host='localhost', port=3306, user='root', password='123456', charset='utf8', database='bangumi')
    cursor = client.cursor()
    try:
        # SQL used to insert one draw.
        sql = 'insert into doubleballs values(0,%s,%s,%s)'
        # SQL used to check whether a draw is already stored.
        select_new_sql = "select * from doubleballs where date_time = %s"

        # Count the scraped draws that are already stored...
        index = 0
        for data_time in date_times:
            result = cursor.execute(select_new_sql, [data_time])
            # `> 0` rather than `== 1` so a draw that was stored more than
            # once is still counted as existing.
            if result > 0:
                index += 1
        # ...and turn that into the number of draws that are new.
        index = count - index

        # Reverse both lists in place; the code treats them as newest-first,
        # so the new draws occupy positions 0 .. index-1.
        # NOTE(review): this assumes the draws missing from the table are
        # exactly the newest `index` ones - TODO confirm.
        date_times.reverse()
        trs.reverse()
        # Walk those positions from index-1 down to 0, i.e. oldest new draw first.
        for i in reversed(range(index)):
            # Red balls: joined with '-' into a single string.
            red_ball = '-'.join(trs[i].xpath('./td[@class="chartBall01"]/text()'))
            # Blue ball: first matching cell of the row.
            blue_ball = trs[i].xpath('./td[@class="chartBall02"]/text()')[0]
            print(f"第{date_times[i]}红球是:{red_ball} 蓝球:{blue_ball}")
            cursor.execute(sql, [date_times[i], red_ball, blue_ball])
        # Commit once after all new rows are queued.
        client.commit()
    finally:
        # Release the cursor and the connection even if a step above failed.
        cursor.close()
        client.close()
    1  留言 2021-06-09 13:49:20
eject