2016-05-13 15 views
6

Sto cercando di raggruppare i valori di alcuni file CSV negli intervalli (bin) definiti in un file XML (groups.xml). Ho il seguente codice, che funziona solo in parte e non produce il risultato che mi aspetto. (Titolo della domanda: «Raggruppamento di valori con pandas.cut».)

import os, sys 
import glob 
import pandas as pd 
import xml.etree.cElementTree as ET 

def xml_parse(csv_dir="path/to/files", groups_file=None):
    """Print each CSV value next to the label of the XML group it falls in.

    Every ``<Groups>`` element of the XML file is matched to a CSV file in
    ``csv_dir`` whose name (up to the first dot) equals the lower-cased
    ``<GroupID>`` text.  For a matching file, column 1 of the header-less CSV
    is classified against all ``<groupname>`` ranges of that group and one
    line per row is printed: row index, group label, original value.

    Args:
        csv_dir: Directory that holds the ``*.csv`` files.
        groups_file: Path of the XML file.  Defaults to ``groups.xml`` in the
            parent of ``csv_dir``, which is where the previous implementation
            looked after ``os.chdir('..')``.

    Raises:
        ValueError: If a ``<min>``/``<max>`` text is not a number.
        KeyError: If a matching CSV has no column 1.
    """
    if groups_file is None:
        groups_file = os.path.join(csv_dir, os.pardir, "groups.xml")

    # Map "file name up to the first dot" -> path.  Building full paths
    # avoids changing the process working directory.
    csv_paths = {}
    for path in glob.glob(os.path.join(csv_dir, "*.csv")):
        csv_paths[os.path.basename(path).split('.', 1)[0]] = path

    doc = ET.parse(groups_file).getroot()
    for group in doc.findall('Groups'):
        group_id = _xml_text(group, 'GroupID')
        if group_id is None:
            continue
        # Only the XML side is lower-cased, as before.
        path = csv_paths.get(group_id.lower())
        if path is None:
            continue

        frame = pd.read_csv(path, index_col=None, header=None)
        values = pd.to_numeric(frame[1], errors='coerce')
        labels = _assign_groups(values, group.findall('groupname'))

        # One table per CSV (not one per <groupname>).  to_string() never
        # truncates long frames the way str(DataFrame) can.
        table = pd.concat([labels, frame[1]], axis=1).to_string()
        # As in the original, the column-header line and row 0 are dropped.
        # NOTE(review): presumably row 0 of the CSV is not a data row - confirm.
        for line in table.split("\n")[2:]:
            print(line)


def _xml_text(parent, tag):
    """Return the stripped text of child ``tag``, or None if absent/empty."""
    text = parent.findtext(tag)
    if text is None:
        return None
    text = text.strip()
    return text or None


def _xml_flag(parent, tag, default):
    """Read a TRUE/FALSE child element as a bool; ``default`` if absent."""
    text = _xml_text(parent, tag)
    if text is None:
        return default
    # The raw string must not be used as a flag: "FALSE" is a truthy str.
    return text.lower() == "true"


def _interval_label(lo_text, lo_inc, hi_text, hi_inc):
    """Build a label such as ``[10, 18)`` or ``>= 75`` for one range."""
    if hi_text is None:
        return "{0} {1}".format(">=" if lo_inc else ">", lo_text)
    if lo_text is None:
        return "{0} {1}".format("<=" if hi_inc else "<", hi_text)
    return "{0}{1}, {2}{3}".format(
        "[" if lo_inc else "(", lo_text, hi_text, "]" if hi_inc else ")")


def _assign_groups(values, group_elements):
    """Return a Series named 'bin' holding the range label of each value.

    Values outside every range (and non-numeric values) stay NaN.  When
    ranges overlap, the first one in document order wins.  A missing
    ``<max>`` (or ``<min>``) leaves that side of the range unbounded.
    """
    labels = pd.Series(float('nan'), index=values.index, name='bin',
                       dtype=object)
    for elem in group_elements:
        lo_text = _xml_text(elem, 'min')
        hi_text = _xml_text(elem, 'max')
        if lo_text is None and hi_text is None:
            continue  # nothing to compare against
        # Defaults mirror pd.cut: lower edge exclusive, upper edge inclusive.
        lo_inc = _xml_flag(elem, 'minInc', False)
        hi_inc = _xml_flag(elem, 'maxInc', True)

        mask = pd.Series(True, index=values.index)
        if lo_text is not None:
            lo = float(lo_text)
            mask &= (values >= lo) if lo_inc else (values > lo)
        if hi_text is not None:
            hi = float(hi_text)
            mask &= (values <= hi) if hi_inc else (values < hi)

        label = _interval_label(lo_text, lo_inc, hi_text, hi_inc)
        labels.loc[mask & labels.isnull()] = label
    return labels

Output attuale:

1 NaN 10.18 
2 NaN 25.16 
3 NaN 44.48 
4 NaN 85.24 
5 NaN 36.71 
6 NaN 77.09 
7 NaN 81.88 
8 NaN 22.92 
9 NaN 44.31 
10 NaN 15.79 
1 [10, 18] 10.18 
2  NaN 25.16 
3  NaN 44.48 
4  NaN 85.24 
5  NaN 36.71 
6  NaN 77.09 
7  NaN 81.88 
8  NaN 22.92 
9  NaN 44.31 
10 [10, 18] 15.79 
1  NaN 10.18 
2 [18, 35] 25.16 
3  NaN 44.48 
4  NaN 85.24 
5  NaN 36.71 
6  NaN 77.09 
7  NaN 81.88 
8 [18, 35] 22.92 
9  NaN 44.31 
10  NaN 15.79 
1  NaN 10.18 
2  NaN 25.16 
3 [35, 50] 44.48 
4  NaN 85.24 
5 [35, 50] 36.71 
6  NaN 77.09 
7  NaN 81.88 
8  NaN 22.92 
9 [35, 50] 44.31 
10  NaN 15.79 
1 NaN 10.18 
2 NaN 25.16 
3 NaN 44.48 
4 NaN 85.24 
5 NaN 36.71 
6 NaN 77.09 
7 NaN 81.88 
8 NaN 22.92 
9 NaN 44.31 
10 NaN 15.79 
1 NaN 10.18 
2 NaN 25.16 
3 NaN 44.48 
4 NaN 85.24 
5 NaN 36.71 
6 NaN 77.09 
7 NaN 81.88 
8 NaN 22.92 
9 NaN 44.31 
10 NaN 15.79 

con un errore:

Traceback (most recent call last): 
    File "groups.py", line 69, in <module> 
    xml_parse() 
    File "groups.py", line 44, in xml_parse 
    df['bin'] = pd.cut(frame[1], [float(minval)], include_lowest= mininc) 
    File "C:\Python27\lib\site-packages\pandas\tools\tile.py", line 113, in cut 
    include_lowest=include_lowest) 
    File "C:\Python27\lib\site-packages\pandas\tools\tile.py", line 203, in _bins_to_cuts 
    include_lowest=include_lowest) 
    File "C:\Python27\lib\site-packages\pandas\tools\tile.py", line 252, in _format_levels 
    levels[0] = '[' + levels[0][1:] 
IndexError: list index out of range 

Output previsto:

1 [10, 18] 10.18 
2 [18, 35] 25.16 
3 [35, 50] 44.48 
4 [>= 75] 85.24 #however >=75 can be represented 
5 [35, 50] 36.71 
6 [>= 75] 77.09 
7 [>= 75] 81.88 
8 [18, 35] 22.92 
9 [35, 50] 44.31 
10 [10, 18] 15.79 

risposta

3

A partire da:

df: 

    val1 val2 
0  NaN 10 
1 10.18  1 
2 25.16  1 
3 44.48  1 
4 85.24  1 
5 36.71  1 
6 77.09  1 
7 81.88  1 
8 22.92  1 
9 44.31  1 
10 15.79  1 

e

xml = """ 
<metaGroups> 
    <Groups> 
     <GroupID>age</GroupID> 
     <description>age</description> 
     <groupname> 
      <Name>0 - &lt;10</Name> 
      <min>0</min> 
      <minInc>TRUE</minInc> 
      <max>10</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>10 - &lt;18</Name> 
      <min>10</min> 
      <minInc>TRUE</minInc> 
      <max>18</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>18 - &lt;35</Name> 
      <min>18</min> 
      <minInc>TRUE</minInc> 
      <max>35</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>35 - &lt;50</Name> 
      <min>35</min> 
      <minInc>TRUE</minInc> 
      <max>50</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>50 - &lt;65</Name> 
      <min>50</min> 
      <minInc>TRUE</minInc> 
      <max>65</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>65 - &lt;75</Name> 
      <min>65</min> 
      <minInc>TRUE</minInc> 
      <max>75</max> 
      <maxInc>FALSE</maxInc> 
     </groupname> 
     <groupname> 
      <Name>&amp;ge;75</Name> 
      <min>75</min> 
      <minInc>TRUE</minInc> 
     </groupname> 
    </Groups> 
</metaGroups> 
""" 

È possibile utilizzare BeautifulSoup per estrarre i parametri bin, costruire le etichette e applicare pd.cut():

from bs4 import BeautifulSoup as Soup 
from itertools import chain 

soup = Soup(xml, 'html.parser')

# Collect one list of edge strings per <groupname>: [min, max], or just
# [min] for the open-ended top group that has no <max> element.
bins = []
for message in soup.findAll('groupname'):
    # Local names avoid shadowing the builtins min() and max().
    lower = message.find('min').text
    upper_tag = message.find('max')
    if upper_tag is None:
        # Explicit check instead of a bare `except:` that would also hide
        # unrelated errors.
        bins.append([lower])
    else:
        bins.append([lower, upper_tag.text])

a quel punto abbiamo

bins 

[['0', '10'], ['10', '18'], ['18', '35'], ['35', '50'], ['50', '65'], ['65', '75'], ['75']] 

Successivamente si appiattisce la lista di liste, si eliminano i duplicati e si aggiunge un limite superiore:

# Keep the per-group [min] / [min, max] lists; they are turned into labels later.
labels = bins
# Flatten, de-duplicate and sort the edges numerically.  Plain Python is used
# because numpy is never imported in this snippet.
bins = sorted(set(int(edge) for edge in chain.from_iterable(bins)))
# Close the open-ended top group just above the largest observed value.
upper = int(df.val1.max() + 1)
if upper <= bins[-1]:
    # pd.cut needs strictly increasing edges even when no value reaches the
    # last group.
    upper = bins[-1] + 1
bins.append(upper)

che produce:

[0, 10, 18, 35, 50, 65, 75, 86] 

Costruire le etichette:

# Two-edge groups become "[min - max]"; the open-ended top group becomes
# "[>= min]", because its lower bound is inclusive (<minInc>TRUE</minInc>).
labels = [
    '[{0} - {1}]'.format(label[0], label[1]) if len(label) > 1
    else '[>= {}]'.format(label[0])
    for label in labels
]

e utilizzando pd.cut():

# right=False makes every bin [min, max), matching the XML definition
# (minInc TRUE, maxInc FALSE); the default right=True would put a value equal
# to an edge, e.g. 18, into the lower group.
df['binned'] = pd.cut(df.val1, bins=bins, labels=labels, right=False)

produce:

 val1 val2  binned 
1 10.18  1 [10 - 18] 
2 25.16  1 [18 - 35] 
3 44.48  1 [35 - 50] 
4 85.24  1 [>= 75] 
5 36.71  1 [35 - 50] 
6 77.09  1 [>= 75] 
7 81.88  1 [>= 75] 
8 22.92  1 [18 - 35] 
9 44.31  1 [35 - 50] 
10 15.79  1 [10 - 18] 
+0

Grazie per la risposta. 'bin' non restituisce nulla per me. – pam

+0

Non avevo incluso la stringa 'xml', vedere la risposta aggiornata. – Stefan

+0

Ho usato il mio file XML. A quanto pare non lo leggevo correttamente. Ora funziona. Grazie mille! – pam

Problemi correlati