I am working on my Naver crawler (Naver is Korea's Google :P). I have been working on this code for a week now, and I have one last task to solve! The code below crawls data through the Naver API and receives the response into "js" on each loop iteration. All I need to do is collect the DataFrame produced by each iteration (dfdfdf) and combine them all at the bottom, but my result always shows only the data from the last iteration. Bottom line: I want to accumulate a DataFrame for every loop iteration. I tried merge and join, but neither seems to work. Please let me know how to do this, and also tell me if my code below does not make sense (or is too messy)!
import json
import os
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
# Naver DataLab search-trend crawler.
#
# Reads keywords from an Excel sheet, queries the DataLab search endpoint for
# them in groups of up to GROUP_SIZE, aligns every response to a weekly
# (Monday) date index and writes the combined table to "Naver123.csv".

START_DATE = '2016-01-01'
GROUP_SIZE = 5
# NOTE(review): the original loop started at index 2270 (presumably a resume
# offset from an earlier run) -- set to 0 to crawl every keyword.
FIRST_KEYWORD_INDEX = 2270
REQUEST_PAUSE_SECONDS = .5

# Naver API connection
client_id = "ID"
client_secret = "PW"
url = "https://openapi.naver.com/v1/datalab/search"


def _build_request_body(keywords, end_date):
    """Return the UTF-8 encoded JSON request body for *keywords*.

    Each keyword becomes its own keyword group, named after the keyword.
    Serialising with ``json.dumps`` (instead of concatenating string pieces)
    keeps the body valid JSON even when a keyword contains quotes or
    backslashes; the compact separators reproduce the original layout.
    """
    payload = {
        "startDate": START_DATE,
        "endDate": end_date,
        "timeUnit": "date",
        "keywordGroups": [
            {"groupName": keyword, "keywords": [keyword]}
            for keyword in keywords
        ],
        # Same age codes "1".."11" the original request sent.
        "ages": [str(code) for code in range(1, 12)],
    }
    text = json.dumps(payload, ensure_ascii=False, separators=(",", ":"))
    return text.encode("utf-8")


def _fetch_trends(keywords, end_date):
    """POST one keyword group and return the decoded JSON response.

    Returns ``None`` (after printing the status code) when the server does
    not answer with HTTP 200, so the caller can skip the group instead of
    parsing a stale or missing response.
    """
    request = urllib.request.Request(
        url,
        data=_build_request_body(keywords, end_date),
        headers={
            "X-Naver-Client-Id": client_id,
            "X-Naver-Client-Secret": client_secret,
            "Content-Type": "application/json",
        },
    )
    try:
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(request) as response:
            status = response.getcode()
            if status != 200:
                print("Error Code:" + str(status))
                return None
            return json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as err:
        # urlopen raises for 4xx/5xx responses rather than returning them.
        print("Error Code:" + str(err.code))
        return None


def _results_to_frame(payload):
    """Convert one API response into a date-indexed DataFrame.

    Every entry of ``payload['results']`` contributes one column named after
    its ``title``. A result without data becomes a single NaN row at the
    response's ``startDate`` so the keyword still appears as a column.
    """
    frames = []
    for result in payload['results']:
        if result['data']:
            frame = (
                pd.DataFrame(result['data'])
                .set_index('period')
                .rename(columns={'ratio': result['title']})
            )
        else:
            frame = pd.DataFrame(
                [np.nan],
                columns=[result['title']],
                index=[payload['startDate']],
            )
        frames.append(frame)
    combined = pd.concat(frames, axis=1)
    # Parse the string labels explicitly so the later join against the
    # DatetimeIndex does not depend on implicit coercion inside pandas.
    combined.index = pd.to_datetime(combined.index)
    return combined


# Keyword list: column 3 followed by column 2 of the sheet, duplicates removed
# while keeping first-seen order.
ex = pd.read_excel('mat_hierarchy.xlsx', sheet_name='Sheet1')
DNA1 = [*ex.iloc[:, 3], *ex.iloc[:, 2]]
DNA = list(dict.fromkeys(DNA1))

# Weekly (Monday) index up to two days ago; the request itself ends yesterday.
# "Today" is read once so both dates come from the same clock reading.
today = datetime.now().date()
dd = pd.date_range(start=START_DATE, end=today - timedelta(2), freq='W-MON')
setendDate = today - timedelta(1)
endDate = setendDate.strftime('%Y-%m-%d')

# Empty frame that carries only the target index; each response is
# left-joined onto it, so only rows whose date is in `dd` are kept.
Data = pd.DataFrame(index=dd)

df_list = []
for i in range(FIRST_KEYWORD_INDEX, len(DNA), GROUP_SIZE):
    # Slicing yields a full group, or the shorter remainder at the very end,
    # which replaces the separate 5/4/3/2/1-keyword branches.
    group = DNA[i:i + GROUP_SIZE]
    print(len(group))
    response_payload = _fetch_trends(group, endDate)
    if response_payload is not None:
        df_list.append(Data.join(_results_to_frame(response_payload)))
    print("end")
    # NOTE(review): the source's indentation was lost; the pause is assumed
    # to apply after every request.
    time.sleep(REQUEST_PAUSE_SECONDS)

# All per-group frames share the index of `Data`, so they are combined
# column-wise into one table.
Final = pd.concat(df_list, axis=1)
Final.to_csv("Naver123.csv")
Consider using a list of dataframes that are concatenated outside of the `for` loop. Whereas the individual loops run a horizontal merge, the final master combine runs a vertical append. Also, for a DRY-er solution, consider using a defined method that turns the response into a dataframe, taking the body variable as a parameter, since it is the only difference between the `if` blocks.