こんにちは、ミナピピン(@python_mllover)です!
今回はPythonを使って、マッチングアプリのプロフィール画像をスクレイピングで自動的に取得してみたいと思います。スクレイピングの対象にするのは、Tinderというマッチングアプリです。
マッチングアプリのプロフィール画像をスクレイピングで自動的に取得する
認証はGoogleアカウントで行っています。いいねボタンを押して次のプロフィールへ進めたいのですが、うまく押せないパターンがあるため、毎回ページを更新(リロード)して表示されるプロフィール情報を切り替えています。
# Tinder profile scraper (tutorial script).
#
# Flow: open tinder.com, log in through the pop-up login window, dismiss the
# start-up dialogs, then for a handful of profiles print name / age / bio,
# save the first profile photo to the current directory, print the first
# profile tag, press "like" and reload the page to move on.
#
# All element lookups use absolute XPaths / exact CSS class strings copied
# from the live page, so they are expected to break whenever the site markup
# changes; every optional step is therefore best-effort.

import re
import subprocess
import sys
import time
import urllib
import uuid
from pathlib import Path

# NOTE(review): presumably imported only for its side effect of putting the
# bundled chromedriver on PATH -- the driver actually used below comes from
# webdriver_manager. Confirm before removing.
import chromedriver_binary
import pandas as pd
import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from webdriver_manager.chrome import ChromeDriverManager

TINDER_URL = "https://tinder.com"

# Placeholders: replace with the real login e-mail address and password.
MAIL = "メールアドレス"
PASSWORD = "パスワード"

# Number of profiles visited per run.
PROFILE_COUNT = 5

# Seconds to wait for the photo download before giving up.
DOWNLOAD_TIMEOUT = 30

# --- login flow -------------------------------------------------------------
LOGIN_LINK_XPATH = "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div/div/header/div/div[2]/div[2]/a"
# The original inline comment labels this button "facebook".
LOGIN_PROVIDER_XPATH = "/html/body/div[2]/div/div/div[1]/div/div[3]/span/div[2]/button/span[1]"
MAIL_INPUT_XPATH = "/html/body/div/div[2]/div[1]/form/div/div[1]/div/input"
PASSWORD_INPUT_XPATH = "/html/body/div/div[2]/div[1]/form/div/div[2]/div/input"
LOGIN_SUBMIT_XPATH = "/html/body/div/div[2]/div[1]/form/div/div[3]/label[2]"
LOGIN_CONFIRM_XPATH = "/html/body/div/div/div/form/div[2]/div/div[2]/div[1]/div[2]/div[1]/button"

# --- start-up dialogs -------------------------------------------------------
COOKIE_ACCEPT_XPATH = "/html/body/div[1]/div/div[2]/div/div/div[1]/button"
LATER_BUTTON_XPATH = "/html/body/div[2]/div/div/div/div[3]/button[2]"
# NOTE(review): XPath steps named svg / g / circle / path may never match in
# a browser because SVG nodes live in the SVG namespace (the usual spelling
# is *[name()='svg']). Left unchanged; confirm against the live page.
PROFILE_OPEN_XPATH = "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[1]/span/a/svg/g/circle"

# --- profile card: candidates are tried in order ----------------------------
# NOTE(review): the second name candidate is identical to the second bio
# candidate, which looks like a copy/paste slip -- verify.
NAME_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div/div/div[3]/div[3]/div/div[1]/div/div/span",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[3]/div[3]/div/div[2]/div/div",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[2]/div[1]/div/div[1]/div/h1",
)
AGE_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div/div/div[3]/div[3]/div/div[1]/div/span[2]",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[3]/div[3]/div/div[1]/div/span[2]",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[2]/div[1]/div/div[1]/span",
)
BIO_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div/div/div[3]/div[3]/div/div[2]/div/div",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[3]/div[3]/div/div[2]/div/div",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[2]/div[1]/div/div[2]/div[1]/div[2]",
)
PHOTO_DIV_CLASSES = (
    "Bdrs(8px) Bgz(cv) Bgp(c) StretchedBox",
    "profileCard__slider__img Z(-1)",
)
DETAIL_BUTTON_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[3]/div[3]/button",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[1]/span/a/svg",
)
TAG_DIV_CLASS = "Bdrs(100px) Bd D(ib) Va(m) Fz($xs) Mend(8px) Mb(8px) Px(8px) Py(4px) Bdc($c-secondary) C($c-secondary)"
LIKE_BUTTON_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div/div/div[4]/div/div[4]/button/span/span",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[2]/div/div/div[4]/button/span/span/svg/path",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[4]/div/div[4]/button/span/span/svg",
)

# Pulls the address out of an inline style such as
#   background-image: url("https://.../photo.jpg");
# The optional quote is captured so the same quote must close the URL.
STYLE_URL_PATTERN = re.compile(r"""url\((["']?)(.*?)\1\)""")

# Characters that are path separators, reserved on common file systems, or
# control characters; scraped text must not be able to inject any of them.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _upgrade_chromedriver_binary():
    """Upgrade the chromedriver_binary package via the running interpreter's pip."""
    # An argument list (no shell) avoids shell parsing, and
    # `sys.executable -m pip` targets the environment this script runs in.
    status = subprocess.call(
        [sys.executable, "-m", "pip", "install", "--upgrade", "chromedriver_binary"]
    )
    if status != 0:
        print(f"pip upgrade of chromedriver_binary failed (exit code {status})")
    return status


def _build_driver():
    """Start Chrome with the notification preference used by this script."""
    options = webdriver.ChromeOptions()
    # NOTE(review): value 2 is presumably Chrome's "block" setting for
    # notification prompts -- confirm against Chrome's preference docs.
    options.add_experimental_option(
        "prefs", {"profile.default_content_setting_values.notifications": 2}
    )
    driver_path = ChromeDriverManager().install()
    if int(selenium.__version__.split(".")[0]) >= 4:
        # Selenium 4 takes the driver path through a Service object; the
        # positional path / chrome_options= form is no longer accepted there.
        from selenium.webdriver.chrome.service import Service

        return webdriver.Chrome(service=Service(driver_path), options=options)
    return webdriver.Chrome(driver_path, options=options)


def _click(driver, xpath):
    """Click the element at *xpath*; raises WebDriverException on failure."""
    driver.find_element(By.XPATH, xpath).click()


def _try_click(driver, xpath, label):
    """Best-effort click: report and return False instead of raising."""
    try:
        _click(driver, xpath)
    except WebDriverException as err:
        print(f"skipped {label}: {type(err).__name__}")
        return False
    return True


def _first_text(driver, xpaths):
    """Return the text of the first XPath in *xpaths* that resolves.

    Raises the last WebDriverException when no candidate matches, so a page
    that shows no profile still stops the run as it did originally.
    """
    last_error = NoSuchElementException("no XPath candidates given")
    for xpath in xpaths:
        try:
            return driver.find_element(By.XPATH, xpath).text
        except WebDriverException as err:
            last_error = err
    raise last_error


def _safe_filename(text):
    """Replace characters that are unsafe in a file name with underscores."""
    cleaned = UNSAFE_FILENAME_CHARS.sub("_", text).strip()
    return cleaned or "profile"


def _log_in(driver, mail, password):
    """Open Tinder, log in through the pop-up window, return the main window handle."""
    driver.get(TINDER_URL)
    time.sleep(2)
    driver.maximize_window()
    _click(driver, LOGIN_LINK_XPATH)
    time.sleep(2)
    _click(driver, LOGIN_PROVIDER_XPATH)
    time.sleep(4)

    handles = driver.window_handles
    # The login form opens in a new window, which is the last handle.
    driver.switch_to.window(handles[-1])
    time.sleep(3)
    driver.find_element(By.XPATH, MAIL_INPUT_XPATH).send_keys(mail)
    driver.find_element(By.XPATH, PASSWORD_INPUT_XPATH).send_keys(password)
    _click(driver, LOGIN_SUBMIT_XPATH)
    time.sleep(5)
    # A second confirmation button only appears sometimes (the pop-up may
    # already be closed by now), so this step is optional.
    if _try_click(driver, LOGIN_CONFIRM_XPATH, "login confirmation"):
        time.sleep(3)

    driver.switch_to.window(handles[0])
    time.sleep(8)
    return handles[0]


def _dismiss_startup_dialogs(driver, main_window):
    """Close the optional cookie / "later" dialogs and open the profile view."""
    # Cookie-settings pop-up: accept it when present.
    _try_click(driver, COOKIE_ACCEPT_XPATH, "cookie dialog")

    # The "later" dialog can show up twice in a row, so try twice.
    for _ in range(2):
        time.sleep(3)
        try:
            driver.switch_to.window(main_window)
        except WebDriverException as err:
            print(f"could not switch to main window: {type(err).__name__}")
            continue
        _try_click(driver, LATER_BUTTON_XPATH, "'later' dialog")

    time.sleep(10)
    _try_click(driver, PROFILE_OPEN_XPATH, "profile link")


def _save_first_photo(page_html, name, year):
    """Download the first profile photo found in *page_html*.

    Returns the written file name, or None when the photo element, its URL,
    or the download itself is unavailable.
    """
    soup = BeautifulSoup(page_html, "html.parser")
    photo_div = None
    for css_class in PHOTO_DIV_CLASSES:
        matches = soup.find_all("div", class_=css_class)
        if matches:
            photo_div = matches[0]
            break
    if photo_div is None:
        print("profile photo element not found; skipping download")
        return None

    style = photo_div.get("style")
    print(style)
    match = STYLE_URL_PATTERN.search(str(style))
    if match is None or not match.group(2):
        print("no photo URL in style attribute; skipping download")
        return None
    photo_url = match.group(2)
    print(photo_url)

    try:
        response = requests.get(photo_url, timeout=DOWNLOAD_TIMEOUT)
        # Avoid saving an HTTP error page under a .jpg name.
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"photo download failed: {err}")
        return None

    file_name = _safe_filename(f"{name}_{year}") + "_profile_1.jpg"
    with open(file_name, "wb") as save_file:
        save_file.write(response.content)
    return file_name


def _print_first_tag(driver):
    """Open the profile details and print the first tag, trying each button in turn."""
    for xpath in DETAIL_BUTTON_XPATHS:
        try:
            _click(driver, xpath)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            print(soup.find_all("div", class_=TAG_DIV_CLASS)[0].text)
            return True
        except (WebDriverException, IndexError) as err:
            # Click failed or the profile has no tags: report, try the next.
            print(err.args)
    return False


def _press_like(driver):
    """Click the first "like" button variant that can be found and clicked."""
    for xpath in LIKE_BUTTON_XPATHS:
        try:
            _click(driver, xpath)
            return True
        except WebDriverException:
            continue
    return False


def _process_profile(driver):
    """Print, download and "like" the profile currently shown, then reload."""
    name = _first_text(driver, NAME_XPATHS)
    year = _first_text(driver, AGE_XPATHS)
    print(name, year)
    profile_text = _first_text(driver, BIO_XPATHS)
    print(profile_text)

    _save_first_photo(driver.page_source, name, year)
    _print_first_tag(driver)

    try:
        if not _press_like(driver):
            print("like button not found")
    finally:
        # The like button cannot always be pressed, so the page is reloaded
        # every time to move on to another profile.
        driver.refresh()
        time.sleep(5)
    time.sleep(5)


def main():
    """Run the whole scrape: upgrade driver package, log in, visit profiles."""
    _upgrade_chromedriver_binary()
    driver = _build_driver()
    try:
        main_window = _log_in(driver, MAIL, PASSWORD)
        _dismiss_startup_dialogs(driver, main_window)
        for _ in range(PROFILE_COUNT):
            _process_profile(driver)
    finally:
        # Always shut down the browser and the chromedriver process.
        driver.quit()


if __name__ == "__main__":
    main()
コメント