import datetime
import re
from urllib.parse import quote_plus

import pymongo
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
# Connect to the "cine21" database and list the actors already stored there.
# NOTE(review): the credentials are hard-coded in source; consider loading
# them from the environment or a secrets store instead.
username = 'davelee'
password = 'happy91'

# Percent-escape the credentials so reserved URI characters (such as '@',
# ':' or '/') in either value cannot corrupt the connection string.
connection = pymongo.MongoClient(
    'mongodb://%s:%s@www.funcoding.xyz'
    % (quote_plus(username), quote_plus(password))
)
mongodb = connection.cine21
actor_collection = mongodb.actor_collection

actor_list = actor_collection.find()
for actor in actor_list:
    # Further down, this script renames the "actor" field to "actor_name",
    # so documents left over from a previous run only carry the new key.
    # Accept either name instead of raising KeyError on a re-run.
    print(actor.get('actor', actor.get('actor_name')))
# Request the cine21 person-ranking content for `month` and parse the HTML.
cine21_url = 'http://www.cine21.com/rank/person/content'
month = "2017-10"

# Form fields sent in the POST body.
conditions = {
    'section': 'actor',
    'period_start': month,
    'gender': 'all',
    'page': 1,
}

# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.post(cine21_url, data=conditions, timeout=10)
soup = BeautifulSoup(response.content.decode('utf-8'), 'html.parser')

# Every `div.name` element found inside an `li.people_li` entry.
actors = soup.select('li.people_li div.name')
import re
# Print the text of each selected name element with every parenthesised run
# of word characters (e.g. "(abc)") removed.
for actor in actors:
    # Raw string: "\(" and "\w" are invalid escape sequences in a normal
    # string literal and trigger a warning on recent Python versions.
    print(re.sub(r"\(\w+\)", "", actor.text))
# For every selected name element, follow its link to the person page and
# collect the `ul.default_info` items into one dict per actor. The resulting
# list is in the same order as `actors`.
actor_detail_info = list()
for actor in actors:
    actor_info_dict = dict()
    actor_info = 'http://www.cine21.com' + actor.select_one('a').attrs['href']
    # Without a timeout, one stalled page would block the whole crawl forever.
    response_actor = requests.get(actor_info, timeout=10)
    soup_actor = BeautifulSoup(response_actor.content.decode('utf-8'), 'html.parser')
    # Query the info lists once and reuse the result for the iteration.
    actor_datas = soup_actor.select('ul.default_info')
    for actor_data in actor_datas:
        for actor_item in actor_data.select('li'):
            # Remove the <span>...</span> parts (this includes the title
            # span), then strip any remaining tags to leave the value text.
            actor_item_text = re.sub('<span.*?>.*?</span>', '', str(actor_item))
            actor_item_text = re.sub('<.+?>', '', actor_item_text)
            # Key: text of the item's `span.tit`; value: the cleaned text.
            actor_info_dict[actor_item.select_one('span.tit').text] = actor_item_text.strip()
    actor_detail_info.append(actor_info_dict)
print(actor_detail_info)
# Collect the `ul.default_info` items of one fixed person page into a dict.
actor_info_dict = dict()
actor_info = 'http://www.cine21.com/db/person/info/?person_id=19889'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response_actor = requests.get(actor_info, timeout=10)
soup_actor = BeautifulSoup(response_actor.content.decode('utf-8'), 'html.parser')
# Query the info lists once and reuse the result for the iteration.
actor_datas = soup_actor.select('ul.default_info')
for actor_data in actor_datas:
    for actor_item in actor_data.select('li'):
        # Remove the <span>...</span> parts (this includes the title span),
        # then strip any remaining tags to leave only the value text.
        actor_item_text = re.sub('<span.*?>.*?</span>', '', str(actor_item))
        actor_item_text = re.sub('<.+?>', '', actor_item_text)
        # Key: text of the item's `span.tit`; value: the cleaned text.
        actor_info_dict[actor_item.select_one('span.tit').text] = actor_item_text.strip()
print(actor_info_dict)
# Same extraction as a step-by-step walkthrough: fetch one fixed person page,
# select its `ul.default_info` lists, and build a title -> value dict.
actor_info_dict = dict()
actor_info = 'http://www.cine21.com/db/person/info/?person_id=19889'
# Without a timeout, requests waits indefinitely on an unresponsive server.
response_actor = requests.get(actor_info, timeout=10)
soup_actor = BeautifulSoup(response_actor.content.decode('utf-8'), 'html.parser')
# Query the info lists once and reuse the result for the iteration.
actor_datas = soup_actor.select('ul.default_info')
for actor_data in actor_datas:
    for actor_item in actor_data.select('li'):
        # Remove the <span>...</span> parts (this includes the title span),
        # then strip any remaining tags to leave only the value text.
        actor_item_text = re.sub('<span.*?>.*?</span>', '', str(actor_item))
        actor_item_text = re.sub('<.+?>', '', actor_item_text)
        # Key: text of the item's `span.tit`; value: the cleaned text.
        actor_info_dict[actor_item.select_one('span.tit').text] = actor_item_text.strip()
print(actor_info_dict)
# Pick the <strong> elements under `ul.num_info` of each `li.people_li`
# entry in the ranking page, then echo the text of each one.
rate_selector = 'li.people_li ul.num_info strong'
actor_rates = soup.select(rate_selector)
for rate_tag in actor_rates:
    print(rate_tag.text)
# Turn each rate text into an integer after removing its commas.
# Without the int() call the values would remain strings.
actor_rate_list = [
    int(rate_tag.text.replace(",", ""))
    for rate_tag in actor_rates
]
# Build the list of actor names, removing every parenthesised run of word
# characters (e.g. "(abc)") from each name element's text.
# From here on `actor_list` is this list of strings; it replaces the MongoDB
# cursor bound to the same name at the top of the script.
# Raw string: "\(" and "\w" are invalid escape sequences in a normal string
# literal and trigger a warning on recent Python versions.
actor_list = [re.sub(r"\(\w+\)", "", actor.text) for actor in actors]
# For each `ul.mov_list` under an `li.people_li` entry, gather the text of
# its `li a span` elements. The result is a list of title lists, one inner
# list per selected `ul.mov_list`.
movies = soup.select('li.people_li ul.mov_list')
movie_list = [
    [title_tag.text for title_tag in movie_block.select('li a span')]
    for movie_block in movies
]
# Store one document per actor, combining the entries found at the same
# position in the name, detail, rate and movie lists.
for position, actor_name in enumerate(actor_list):
    actor_document = {
        "actor": actor_name,
        "actor_details": actor_detail_info[position],
        "actor_rate": actor_rate_list[position],
        "date": month,
        "movie_list": movie_list[position],
    }
    actor_collection.insert_one(actor_document)
# Dump every document currently in the collection.
for stored_doc in actor_collection.find():
    print(stored_doc)
# Drop the collection, then query it again and print whatever find() still
# returns (presumably nothing, since the collection was just dropped).
actor_collection.drop()
for remaining_doc in actor_collection.find():
    print(remaining_doc)
# Assemble the full list of actor documents in memory, pairing up the
# entries found at the same position in each of the parallel lists.
actor_info = [
    {
        "actor": actor_name,
        "actor_details": actor_detail_info[position],
        "actor_rate": actor_rate_list[position],
        "date": month,
        "movie_list": movie_list[position],
    }
    for position, actor_name in enumerate(actor_list)
]
# Insert all prepared actor documents with a single call, then print the
# collection's contents.
# insert_many() rejects an empty document list with an exception, so skip
# the call when the scrape produced no documents.
if actor_info:
    actor_collection.insert_many(actor_info)
else:
    print("No actor documents to insert.")
docs = actor_collection.find()
for doc in docs:
    print(doc)
# Rename the "actor" field to "actor_name" in every document (the empty
# filter matches all documents), then print the collection's contents.
rename_actor_field = {"$rename": {"actor": "actor_name"}}
actor_collection.update_many({}, rename_actor_field)
for renamed_doc in actor_collection.find():
    print(renamed_doc)
# Rename the "actor_details" field to "actor_info" in every document (the
# empty filter matches all documents), then print the collection's contents.
match_all = {}
rename_details_field = {"$rename": {"actor_details": "actor_info"}}
actor_collection.update_many(match_all, rename_details_field)
docs = actor_collection.find()
for doc in docs:
    print(doc)

# Bind a shorter name to the same collection object.
actor = actor_collection