【Python】マッチングアプリのプロフィール画像をスクレイピングで自動的に大量取得する

こんにちは、ミナピピン(@python_mllover)です！

今回はPythonを使って、マッチングアプリのプロフィール画像をスクレイピングで自動的に取得してみたいと思います。スクレイピングの対象は、Tinderというマッチングアプリです。

マッチングアプリのプロフィール画像をスクレイピングで自動的に取得する

認証はGoogleアカウントで行っています。いいねを押して次のプロフィールに進めたいのですが、いいねボタンをうまく押せないパターンがあるため、その場合はページを更新して表示されるプロフィール情報を切り替えています。

import re
import subprocess
import sys
import time
import urllib
import uuid
from pathlib import Path

import chromedriver_binary
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from webdriver_manager.chrome import ChromeDriverManager

# Upgrade the chromedriver_binary package before the browser is started.
# The command is passed as an argument list (no shell involved) and pip is run
# through the current interpreter, so the package is upgraded in the same
# environment this script runs in rather than whichever `pip` is first on PATH.
# NOTE(review): chromedriver_binary is already imported above, so an upgrade
# performed here presumably only takes effect on the next run - confirm.
cmd = [sys.executable, '-m', 'pip', 'install', '--upgrade', 'chromedriver_binary']
res = subprocess.call(cmd)  # pip's exit status; 0 means success

# Configure Chrome so that site notification prompts are blocked
# (content-setting value 2), then start the browser with a driver binary
# downloaded by webdriver_manager.
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)
# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): passing the driver path positionally matches the Selenium
# release this script was written for; recent Selenium 4 releases expect a
# Service object instead - confirm against the installed version.
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)


# Open the Tinder landing page. The fixed sleeps give the page time to render
# before elements are looked up by absolute XPath.
url = "https://tinder.com"
driver.get(url)
time.sleep(2)
driver.maximize_window()

# Click the link in the page header (presumably the log-in entry point; the
# original comment was empty - confirm against the live page).
# find_element(By.XPATH, ...) is used because the find_element_by_* helpers
# are no longer available in current Selenium releases.
btn = driver.find_element(By.XPATH, "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div/div/header/div/div[2]/div[2]/a")
btn.click()

time.sleep(2)

# Click the sign-in provider button inside the dialog.
# NOTE(review): the original comment labels this button "facebook" while the
# article text says a Google account is used - confirm which provider this is.
btn = driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div[1]/div/div[3]/span/div[2]/button/span[1]")
btn.click()

# Wait for the sign-in window to open.
time.sleep(4)

# Placeholder credentials: replace the two strings with the real e-mail
# address and password before running.
mail = "メールアドレス"
pass_ = "パスワード"

# Remember all window handles and switch to the last one (the original comment
# describes it as the frontmost window, i.e. the sign-in popup).
handle_array = driver.window_handles
driver.switch_to.window(handle_array[-1])
time.sleep(3)

# Fill in the e-mail address.
mail_input = driver.find_element(By.XPATH, "/html/body/div/div[2]/div[1]/form/div/div[1]/div/input")
mail_input.send_keys(mail)

# Fill in the password.
pass_input = driver.find_element(By.XPATH, "/html/body/div/div[2]/div[1]/form/div/div[2]/div/input")
pass_input.send_keys(pass_)

# Press the log-in button of the form.
btn = driver.find_element(By.XPATH, "/html/body/div/div[2]/div[1]/form/div/div[3]/label[2]")
btn.click()
time.sleep(5)

# A further button may or may not be shown after submitting the form, so this
# click is best-effort. Only WebDriver errors (element or window not found,
# click not possible) are ignored; Ctrl-C and programming errors still surface.
try:
    btn = driver.find_element(By.XPATH, "/html/body/div/div/div/form/div[2]/div/div[2]/div[1]/div[2]/div[1]/button")
    btn.click()
    time.sleep(3)
except WebDriverException:
    pass


# Switch back to the first window (the main Tinder page) and wait for it to
# finish loading after the log-in.
driver.switch_to.window(handle_array[0])
time.sleep(8)

# Cookie-settings popup: press the accept button if the popup is shown.
# Only WebDriver errors are ignored, so Ctrl-C and programming errors are not
# silently swallowed the way a bare `except:` would.
try:
    btn = driver.find_element(By.XPATH, "/html/body/div[1]/div/div[2]/div/div/div[1]/button")
    btn.click()
except WebDriverException:
    pass

time.sleep(3)

# Two further dialogs may appear one after another at the same location; press
# their second button (the original comment reads "later, if there is a LIKE").
# The pauses after the two attempts are 3 s and 10 s.
for pause_seconds in (3, 10):
    try:
        driver.switch_to.window(handle_array[0])
        btn = driver.find_element(By.XPATH, "/html/body/div[2]/div/div/div/div[3]/button[2]")
        btn.click()
    except WebDriverException:
        pass
    time.sleep(pause_seconds)



# Open the profile view (best-effort: WebDriver errors are ignored so the
# script continues when the element is missing or cannot be clicked).
# NOTE(review): this XPath steps through <svg>/<g>/<circle>. XPath name tests
# against SVG elements often need name()/local-name() predicates, so this
# locator may never match - verify against the live page before changing it,
# because a click that starts succeeding would alter the page state the loop
# below expects.
try:
    btn = driver.find_element(By.XPATH, "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[1]/span/a/svg/g/circle")
    btn.click()
except WebDriverException:
    pass

# NOTE(review): this loop stores third-party personal data (name, age, photo)
# on disk. Confirm that the site's terms of service and the applicable
# privacy rules permit this before running it.

# Locator candidates for each field, tried in order; the page layout differs
# between profile cards, so several absolute XPaths are kept per field.
# NOTE(review): the second name candidate is identical to the second
# profile-text candidate - possibly a copy-paste slip; verify on the live page.
_NAME_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div/div/div[3]/div[3]/div/div[1]/div/div/span",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[3]/div[3]/div/div[2]/div/div",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[2]/div[1]/div/div[1]/div/h1",
)
_YEAR_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div/div/div[3]/div[3]/div/div[1]/div/span[2]",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[3]/div[3]/div/div[1]/div/span[2]",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[2]/div[1]/div/div[1]/span",
)
_PROFILE_TEXT_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div/div/div[3]/div[3]/div/div[2]/div/div",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[3]/div[3]/div/div[2]/div/div",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[2]/div[1]/div/div[2]/div[1]/div[2]",
)
# Class attribute values of the <div> whose inline style carries the photo URL.
_PHOTO_DIV_CLASSES = (
    "Bdrs(8px) Bgz(cv) Bgp(c) StretchedBox",
    "profileCard__slider__img Z(-1)",
)
# Class attribute value of the tag <div>s whose first entry is printed.
_TAG_DIV_CLASS = "Bdrs(100px) Bd D(ib) Va(m) Fz($xs) Mend(8px) Mb(8px) Px(8px) Py(4px) Bdc($c-secondary) C($c-secondary)"
_DETAIL_BUTTON_XPATH = "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[3]/div[3]/button"
_PROFILE_LINK_XPATH = "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[1]/span/a/svg"
_LIKE_BUTTON_XPATH = "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div/div/div[4]/div/div[4]/button/span/span"
# NOTE(review): both fallbacks step through <svg> elements; such XPaths often
# need name()/local-name() predicates to match - verify on the live page.
_LIKE_FALLBACK_XPATHS = (
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[2]/div/div/div[4]/button/span/span/svg/path",
    "/html/body/div[1]/div/div[1]/div/main/div[1]/div/div/div[1]/div[1]/div/div[4]/div/div[4]/button/span/span/svg",
)

# Captures the target of a CSS url(...) expression, with or without quotes.
_STYLE_URL_RE = re.compile(r'url\(\s*["\']?(.*?)["\']?\s*\)')
# Characters that must not end up in a file name: path separators, characters
# reserved on Windows, and control characters such as newlines.
_UNSAFE_FILENAME_CHARS_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _first_text(xpaths):
    """Return the text of the first element found among *xpaths*.

    Candidates are tried in order. WebDriver errors on every candidate but the
    last are ignored; an error on the last candidate propagates to the caller.
    """
    for xpath in xpaths[:-1]:
        try:
            return driver.find_element(By.XPATH, xpath).text
        except WebDriverException:
            continue
    return driver.find_element(By.XPATH, xpaths[-1]).text


def _click(xpath):
    """Click the element at *xpath*; return False on a WebDriver error."""
    try:
        driver.find_element(By.XPATH, xpath).click()
    except WebDriverException:
        return False
    return True


def _click_and_print_first_tag(xpath):
    """Click the element at *xpath*, then print the first tag on the page.

    Any failure (element missing, no tag present) is reported by printing the
    exception arguments; it never aborts the loop.
    """
    try:
        driver.find_element(By.XPATH, xpath).click()
        page = BeautifulSoup(driver.page_source, "html.parser")
        print(page.find_all('div', class_=_TAG_DIV_CLASS)[0].text)
    except Exception as e:
        print(e.args)


def _background_image_url(style):
    """Return the URL inside ``url(...)`` of a CSS *style* string, or None."""
    if not style:
        return None
    match = _STYLE_URL_RE.search(style)
    if match is None:
        return None
    return match.group(1) or None


def _safe_filename_part(text):
    """Return *text* with characters unsafe in file names replaced by ``_``."""
    cleaned = _UNSAFE_FILENAME_CHARS_RE.sub("_", text).strip()
    return cleaned or "unknown"


# Process five profiles: read name / age / bio, save the first photo, print
# the first tag, then press "like" to move on to the next profile.
for _ in range(5):
    name = _first_text(_NAME_XPATHS)
    year = _first_text(_YEAR_XPATHS)
    print(name, year)

    profile_text = _first_text(_PROFILE_TEXT_XPATHS)
    print(profile_text)

    # The photo is a CSS background image, so its URL is taken from the inline
    # style of the photo <div> in the rendered page source.
    soup = BeautifulSoup(driver.page_source, "html.parser")
    photo_div = None
    for css_class in _PHOTO_DIV_CLASSES:
        photo_div = soup.find('div', class_=css_class)
        if photo_div is not None:
            break
    style = photo_div.get("style") if photo_div is not None else None
    print(style)
    photo_url = _background_image_url(style)
    print(photo_url)

    if photo_url is None:
        # Without a URL there is nothing to download; carry on so the loop can
        # still advance to the next profile.
        print("photo URL not found; skipping download")
    else:
        try:
            # The timeout keeps a stalled connection from hanging the script.
            response = requests.get(photo_url, timeout=30)
            # Do not save an HTTP error page under a .jpg name.
            response.raise_for_status()
        except requests.RequestException as e:
            print(e.args)
        else:
            # name and year come from the remote page, so they are sanitised
            # before being used as part of a local file name.
            filename = f'{_safe_filename_part(name)}_{_safe_filename_part(year)}_profile_1.jpg'
            with open(filename, 'wb') as save_file:
                save_file.write(response.content)

    # Press the detail button, then the profile link; after each click the
    # first tag found on the page is printed.
    _click_and_print_first_tag(_DETAIL_BUTTON_XPATH)
    _click_and_print_first_tag(_PROFILE_LINK_XPATH)

    # Press "like". When the primary button cannot be clicked, the fallback
    # locators are tried and the page is then refreshed to switch to another
    # profile, whether or not a fallback click succeeded.
    if not _click(_LIKE_BUTTON_XPATH):
        for fallback_xpath in _LIKE_FALLBACK_XPATHS:
            if _click(fallback_xpath):
                break
        driver.refresh()
        time.sleep(5)
    time.sleep(5)