Naver Crawler: Combining DataFrame per each loop P

2019-08-23 11:06发布

I am working on my Naver crawler (Naver is Korea's Google :P). I have been working on this code for a week now, and I have one last task to solve! The code below crawls data through the Naver API and receives the response as "js" in each loop iteration. All I need to do is build a DataFrame (dfdfdf) in each iteration and combine them all at the bottom, but my result always shows only the data from the last iteration. The bottom line is that I want to accumulate the DataFrame produced by every loop iteration. I tried merge and join, but neither seems to work. Please let me know how to do this, and also tell me if my code below does not make sense (or is too messy)!

import os
import sys
import urllib.request
import pandas as pd
import json
import numpy as np
from datetime import datetime, timedelta
import time

# Load the first worksheet of the keyword hierarchy workbook.
workbook = pd.ExcelFile('mat_hierarchy.xlsx')
ex = workbook.parse('Sheet1')

# Candidate keywords: every value of the 4th column, then every value of the
# 3rd column (positional columns 3 and 2).
DNA1 = [*ex.iloc[:, 3], *ex.iloc[:, 2]]

# Drop repeats while keeping the first occurrence of each keyword in order;
# dict keys preserve insertion order, so this matches a "seen set" loop.
DNA = list(dict.fromkeys(DNA1))
seen = set(DNA)

# len(DNA)

# Date index for the result table: one row per Monday from 2016-01-01 up to
# two days before today.
# (Daily alternative: pd.date_range('2016-01-01', datetime.now().date() - timedelta(2)))
dd = pd.date_range(
    start='2016-01-01',
    end=datetime.now().date() - timedelta(days=2),
    freq='W-MON',
)

# End date sent to the API: yesterday, formatted as YYYY-MM-DD.
setendDate = datetime.now().date() - timedelta(days=1)
endDate = setendDate.isoformat()

# Empty frame that only carries the date index; every batch is joined onto it.
Data = pd.DataFrame(index=dd)

# Naver API connection settings (placeholder credentials).
client_id = "ID"
client_secret = "PW"
url = "https://openapi.naver.com/v1/datalab/search"

# Fragments of the JSON request body. A request is assembled as:
#   body_intro + <end date> + body_endDate
#   + <kw> + body_keywords + <kw>                      (first keyword group)
#   + body_groupName + <kw> + body_keywords + <kw>     (each further group)
#   + body_last
body_intro = '{"startDate":"2016-01-01","endDate":"'
body_endDate = '","timeUnit":"date","keywordGroups":[{"groupName":"'
body_keywords = '","keywords":["'
body_groupName = '"]},{"groupName":"'
# Closes the last keyword group and appends the age codes "1" through "11".
body_last = '"]}],"ages":[' + ','.join('"%d"' % age for age in range(1, 12)) + ']}'

# Index of the first keyword to query and the number of keywords per request.
# START_INDEX keeps the offset the script was written with; set it to 0 to
# crawl the whole keyword list.
START_INDEX = 2270
BATCH_SIZE = 5


def _build_body(keywords):
    """Return the JSON request body for one batch of keywords.

    Each keyword becomes its own group, named after the keyword itself, using
    the body_* template fragments defined above.
    """
    groups = body_groupName.join(kw + body_keywords + kw for kw in keywords)
    return body_intro + endDate + body_endDate + groups + body_last


def _fetch_json(body):
    """POST ``body`` to the API and return the decoded JSON, or None.

    None is returned (after printing the status code) when the status is not
    200, so the caller can skip the batch instead of parsing stale data.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # rescode is an int; it must be converted before concatenation.
            print("Error Code:" + str(rescode))
            return None
        return json.loads(response.read().decode('utf-8'))


def _results_to_frame(d):
    """Turn one decoded API response into ``Data`` joined with its columns.

    Every entry of ``d['results']`` yields one column named after its title.
    An entry without data yields a single NaN placed at ``d['startDate']`` so
    the keyword still appears as a column.
    """
    lst = []
    for r in d['results']:
        if r['data']:
            frame = (pd.DataFrame(r['data'])
                     .set_index('period')
                     .rename(columns={'ratio': r['title']}))
        else:
            frame = pd.DataFrame([np.nan], columns=[r['title']],
                                 index=[d['startDate']])
        lst.append(frame)
    # axis must be passed by keyword; recent pandas rejects it positionally.
    df = pd.concat(lst, axis=1)
    # Data is indexed by timestamps while the period labels come straight from
    # JSON (presumably 'YYYY-MM-DD' strings), so parse them to make the join
    # align on real dates regardless of the pandas version.
    df.index = pd.to_datetime(df.index)
    return Data.join(df)


df_list = []

for i in range(START_INDEX, len(DNA), BATCH_SIZE):
    # Slicing naturally yields a shorter final batch, which replaces the five
    # hand-written branches for 5/4/3/2/1 remaining keywords.
    chunk = DNA[i:i + BATCH_SIZE]
    print(str(len(chunk)))

    d = _fetch_json(_build_body(chunk))
    if d is not None:
        df_list.append(_results_to_frame(d))

    print("end")
    # Pause between requests.
    time.sleep(.5)

# Combine all batches side by side: they share Data's date index and each one
# contributes its own keyword columns. Fall back to the bare date index when
# no batch was collected, because pd.concat raises on an empty list.
Final = pd.concat(df_list, axis=1) if df_list else Data.copy()
Final.to_csv("Naver123.csv")

1条回答
萌系小妹纸
2楼-- · 2019-08-23 11:52

Consider collecting the dataframes in a list and concatenating them once, outside of the for loop. Each loop iteration runs a horizontal merge, whereas the final master combine runs a vertical append.

Also, for a DRY-er solution, consider defining a function that turns the response into a dataframe and takes the body variable as its parameter, since the body is the only difference between the if blocks.

...
def response_to_df(body):
    """POST one request body to the API and return the result as a DataFrame.

    Parameters:
        body: JSON request payload as a string; it is UTF-8 encoded and sent
            as the POST data.

    Returns:
        ``Data`` left-joined with one column per entry of the response's
        ``results`` list, each column named after the entry's ``title``.

    Raises:
        RuntimeError: if the response status code is not 200.
    """
    request = urllib.request.Request(url)
    request.add_header("X-Naver-Client-Id", client_id)
    request.add_header("X-Naver-Client-Secret", client_secret)
    request.add_header("Content-Type", "application/json")
    # The context manager closes the HTTP response once the payload is read.
    with urllib.request.urlopen(request, data=body.encode("utf-8")) as response:
        rescode = response.getcode()
        if rescode != 200:
            # Fail loudly: there is no payload to parse, and rescode is an int
            # that must be converted before string concatenation.
            raise RuntimeError("Error Code:" + str(rescode))
        js = response.read().decode('utf-8')

    d = json.loads(js)
    lst = []
    for r in d['results']:
        if r['data']:
            frame = (pd.DataFrame.from_dict(r['data'])
                     .set_index('period')
                     .rename(columns={'ratio': r['title']}))
        else:
            # A result without data has no 'period' column to index on; keep
            # the keyword as a column holding a single NaN at the start date.
            frame = pd.DataFrame([np.nan], columns=[r['title']],
                                 index=[d['startDate']])
        lst.append(frame)

    # HORIZONTAL MERGE
    df = pd.concat(lst, axis=1)
    # Data is indexed by timestamps while the period labels come straight from
    # JSON (presumably 'YYYY-MM-DD' strings), so parse them before joining.
    df.index = pd.to_datetime(df.index)
    df = Data.join(df)
    return df


df_list = []
# Walk DNA in steps of five keywords. range() needs an explicit start of 0:
# range(len(DNA), 5) would treat len(DNA) as the start and 5 as the stop.
for i in range(0, len(DNA), 5):
    # The final slice is simply shorter when fewer than five keywords remain,
    # so one code path covers the 5/4/3/2/1 cases.
    chunk = DNA[i:i + 5]
    body = body_intro + endDate + body_endDate + \
           body_groupName.join(kw + body_keywords + kw for kw in chunk) + \
           body_last
    # Progress output: number of keywords in this request.
    print(str(len(chunk)))

    tmp = response_to_df(body)
    df_list.append(tmp)


# Combining all Data (VERTICAL APPEND)
# NOTE(review): every frame returned by response_to_df carries Data's index
# and its own keyword columns, so axis=0 stacks the same dates once per batch
# with NaN in the other batches' columns. Use axis=1 if one row per date with
# all keywords side by side is wanted - confirm the intended layout.
Naver = pd.concat(df_list, axis=0)
print("ddd")
Naver
查看更多
登录 后发表回答