Lettura del file come record singolo in hadoop

Ho grandissimo n. di file di piccole dimensioni, voglio utilizzare CombineFileInputFormat per unire i file in modo tale che ogni dato di file abbia un unico record nel mio lavoro MR. Ho seguito http://yaseminavcular.blogspot.in/2011/03/many-small-input-files.html e ha cercato di convertirlo nella nuova APILettura del file come record singolo in hadoop

Sono di fronte 2 problemi:

a) sto solo testando con 2 file di piccole dimensioni, ancora 2 mapper vengono licenziati. Mi aspettavo 1

b) Ogni riga viene come record singolo, voglio l'intero file come record singolo.

Può essere doloroso, ma si prega di esaminare il codice qui sotto. Sono ancora un ingenuo in Hadoop

Il driver

La classe di seguito è in gran copia incolla di LineRecordReader con modifica per inizializzare la funzione di classe

public class MRDriver extends Configured implements Tool { 


@Override 
public int run(String[] args) throws Exception { 
    FileSystem fs = new Path(".").getFileSystem(getConf()); 
    fs.printStatistics(); 
    Job job = new Job(getConf()); 
    job.setJobName("Enron MR"); 
    job.setMapperClass(EnronMailReadMapper.class); 
    job.setOutputKeyClass(Text.class); 
    job.setOutputValueClass(IntWritable.class); 
    job.setNumReduceTasks(0); 
    job.setJarByClass(EnronMailReadMapper.class); 
    RawCombineFileInputFormat.addInputPath(job, new Path(args[0])); 
    job.setOutputFormatClass(TextOutputFormat.class); 
    TextOutputFormat.setOutputPath(job, new Path(args[1])); 
    return job.waitForCompletion(true) ? 0 :1; 
} 

public static void main(String[] args) throws Exception { 
    int exitCode = ToolRunner.run(new MRDriver(), args); 
    System.exit(exitCode); 
}

}() & NextKeyValue()

public class SingleFileRecordReader extends RecordReader<LongWritable, Text> { 
    private static final Log LOG = LogFactory.getLog(SingleFileRecordReader.class); 

    private long start; 
    private long pos; 
    private long end; 
    private LineReader in; 
    private int maxLineLength; 
    private LongWritable key = null; 
    private Text value = null; 

    public void initialize(InputSplit genericSplit, 
         TaskAttemptContext context) throws IOException { 
    FileSplit split = (FileSplit) genericSplit; 
    Configuration job = context.getConfiguration(); 
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", 
            Integer.MAX_VALUE); 
    start = split.getStart(); 
    end = start + split.getLength(); 
    final Path file = split.getPath(); 

    // open the file and seek to the start of the split 
    FileSystem fs = file.getFileSystem(job); 
    FSDataInputStream fileIn = fs.open(split.getPath()); 

     fileIn.seek(start); 
     in = new LineReader(fileIn, job); 
    // If this is not the first split, we always throw away first record 
    // because we always (except the last split) read one extra line in 
    // next() method. 
    if (start != 0) { 
     start += in.readLine(new Text(), 0, maxBytesToConsume(start)); 
    } 
    this.pos = start; 
    } 

    private int maxBytesToConsume(long pos) { 
    return (int) Math.min(Integer.MAX_VALUE, end - pos); 
    } 

    private long getFilePosition() throws IOException { 
    long retVal= pos; 
    return retVal; 
    } 

    public boolean nextKeyValue() throws IOException { 
    if (key == null) { 
     key = new LongWritable(); 
    } 
    key.set(pos); 
    if (value == null) { 
     value = new Text(); 
    } 
    int newSize = 0; 
    StringBuffer totalValue = new StringBuffer(); 
    // We always read one extra line, which lies outside the upper 
    // split limit i.e. (end - 1) 
    while (getFilePosition() <= end) { 
     newSize = in.readLine(value, maxLineLength, 
      Math.max(maxBytesToConsume(pos), maxLineLength)); 
     if (newSize == 0) { 
     break; 
     } 
     totalValue.append(value.toString()+"\n"); 
     pos += newSize; 
     if (newSize < maxLineLength) { 
     break; 
     } 

     // line too long. try again 
     LOG.info("Skipped line of size " + newSize + " at pos " + 
       (pos - newSize)); 
    } 
    if (newSize == 0) { 
     key = null; 
     value = null; 
     return false; 
    } else { 
     value = new Text(totalValue.toString()); 
     return true; 
    } 
    } 

    @Override 
    public LongWritable getCurrentKey() { 
    return key; 
    } 

    @Override 
    public Text getCurrentValue() { 
    return value; 
    } 

    /** 
    * Get the progress within the split 
    */ 
    public float getProgress() throws IOException { 
    if (start == end) { 
     return 0.0f; 
    } else { 
     return Math.min(1.0f, 
     (getFilePosition() - start)/(float)(end - start)); 
    } 
    } 

    public synchronized void close() throws IOException { 
    try { 
     if (in != null) { 
     in.close(); 
     } 
    } finally { 

    } 
    }

}

Ot le sue file

public class RawCombineFileInputFormat extends CombineFileInputFormat <LongWritable,Text>{ 

@Override 
public RecordReader<LongWritable, Text> createRecordReader(
     InputSplit split, TaskAttemptContext context) throws IOException { 
    return new CombineFileRecordReader< LongWritable, Text >((CombineFileSplit) split, context, MultiFileRecordReader.class); 
}

}

public class MultiFileRecordReader extends RecordReader < LongWritable, Text > { 

private CombineFileSplit split; 
private TaskAttemptContext context; 
private int index; 
private RecordReader< LongWritable, Text > rr; 

public MultiFileRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index) { 
    this.split = split; 
    this.context = context; 
    this.index = index; 
    this.rr = new SingleFileRecordReader(); 
} 
@Override 
public void initialize(InputSplit split, TaskAttemptContext context) 
     throws IOException, InterruptedException { 
    this.split = (CombineFileSplit) split; 
     this.context = context; 

     if (null == rr) { 
     rr = new SingleFileRecordReader(); 
     } 

     FileSplit fileSplit = new FileSplit(this.split.getPath(index), 
              this.split.getOffset(index), 
              this.split.getLength(index), 
              this.split.getLocations()); 
     this.rr.initialize(fileSplit, this.context); 

} 

@Override 
public boolean nextKeyValue() throws IOException, InterruptedException { 
    // TODO Auto-generated method stub 
    return this.rr.nextKeyValue(); 
} 

@Override 
public LongWritable getCurrentKey() throws IOException, InterruptedException { 
    // TODO Auto-generated method stub 
    return this.rr.getCurrentKey(); 
} 

@Override 
public Text getCurrentValue() throws IOException, InterruptedException { 
    // TODO Auto-generated method stub 
    return this.rr.getCurrentValue(); 
} 

@Override 
public float getProgress() throws IOException, InterruptedException { 
    // TODO Auto-generated method stub 
    return this.rr.getProgress(); 
} 

@Override 
public void close() throws IOException { 
    if (rr != null) { 
      rr.close(); 
      rr = null; 
    }  
}

}

fonte

2013-07-26 S Kr

È necessario impostare la dimensione massima divisione, fino a cui i file sono combinati. Puoi leggerlo nella documentazione di 'CombineFileInputFormat'. –

Date un'occhiata in questo formato.Questo ingresso è un formato di input per la lettura di file multipli in una singola operazione mappa. Verrà letto esattamente un file (senza interruzioni) da ogni record trasmesso al programma di mappatura. WholeFileRecordReader si occupa dell'invio di un contenuto di file come un valore. La chiave restituita è NullWritable e il valore è il contenuto di ogni file nel suo complesso. Ora puoi usare questo ed eseguire il tuo lavoro mapraccar e vedere quanti mappatori effettivamente eseguono e controllare se l'output che stai ottenendo è corretto o meno.

Records sono costruiti da WholeFileRecordReaders.

public class WholeFileInputFormat extends CombineFileInputFormat<NullWritable, Text>{ 

     @Override 
     protected boolean isSplitable(JobContext context, Path file) { 
      return false; 
     } 

/** 
    * Creates a CombineFileRecordReader to read each file assigned to this InputSplit. 
    * Note, that unlike ordinary InputSplits, split must be a CombineFileSplit, and therefore 
    * is expected to specify multiple files. 
    * 
    * @param split The InputSplit to read. Throws an IllegalArgumentException if this is 
    *  not a CombineFileSplit. 
    * @param context The context for this task. 
    * @return a CombineFileRecordReader to process each file in split. 
    *   It will read each file with a WholeFileRecordReader. 
    * @throws IOException if there is an error. 
    */ 

    @Override 
    public RecordReader<NullWritable, Text> createRecordReader(
      InputSplit split, TaskAttemptContext context) throws IOException { 

     if (!(split instanceof CombineFileSplit)) { 
       throw new IllegalArgumentException("split must be a CombineFileSplit"); 
      } 
      return new CombineFileRecordReader<NullWritable, Text>((CombineFileSplit) split, context, WholeFileRecordReader.class); 
    } 

    }

Qui sopra potete WholeFileRecordReader viene utilizzato che è la seguente: -

public class WholeFileRecordReader extends RecordReader<NullWritable, Text> { 
    private static final Logger LOG = Logger.getLogger(WholeFileRecordReader.class); 

     /** The path to the file to read. */ 
     private final Path mFileToRead; 
     /** The length of this file. */ 
     private final long mFileLength; 

     /** The Configuration. */ 
     private final Configuration mConf; 

     /** Whether this FileSplit has been processed. */ 
     private boolean mProcessed; 
     /** Single Text to store the file name of the current file. */ 
    // private final Text mFileName; 
     /** Single Text to store the value of this file (the value) when it is read. */ 
     private final Text mFileText; 

     /** 
     * Implementation detail: This constructor is built to be called via 
     * reflection from within CombineFileRecordReader. 
     * 
     * @param fileSplit The CombineFileSplit that this will read from. 
     * @param context The context for this task. 
     * @param pathToProcess The path index from the CombineFileSplit to process in this record. 
     */ 
     public WholeFileRecordReader(CombineFileSplit fileSplit, TaskAttemptContext context, 
      Integer pathToProcess) { 
     mProcessed = false; 
     mFileToRead = fileSplit.getPath(pathToProcess); 
     mFileLength = fileSplit.getLength(pathToProcess); 
     mConf = context.getConfiguration(); 

     assert 0 == fileSplit.getOffset(pathToProcess); 
     if (LOG.isDebugEnabled()) { 
      LOG.debug("FileToRead is: " + mFileToRead.toString()); 
      LOG.debug("Processing path " + pathToProcess + " out of " + fileSplit.getNumPaths()); 

      try { 
      FileSystem fs = FileSystem.get(mConf); 
      assert fs.getFileStatus(mFileToRead).getLen() == mFileLength; 
      } catch (IOException ioe) { 
      // oh well, I was just testing. 
      } 
     } 

    // mFileName = new Text(); 
     mFileText = new Text(); 
     } 

     /** {@inheritDoc} */ 
     @Override 
     public void close() throws IOException { 
     mFileText.clear(); 
     } 

     /** 
     * Returns the absolute path to the current file. 
     * 
     * @return The absolute path to the current file. 
     * @throws IOException never. 
     * @throws InterruptedException never. 
     */ 
     @Override 
     public NullWritable getCurrentKey() throws IOException, InterruptedException { 
     return NullWritable.get(); 
     } 

     /** 
     * <p>Returns the current value. If the file has been read with a call to NextKeyValue(), 
     * this returns the contents of the file as a BytesWritable. Otherwise, it returns an 
     * empty BytesWritable.</p> 
     * 
     * <p>Throws an IllegalStateException if initialize() is not called first.</p> 
     * 
     * @return A BytesWritable containing the contents of the file to read. 
     * @throws IOException never. 
     * @throws InterruptedException never. 
     */ 
     @Override 
     public Text getCurrentValue() throws IOException, InterruptedException { 
     return mFileText; 
     } 

     /** 
     * Returns whether the file has been processed or not. Since only one record 
     * will be generated for a file, progress will be 0.0 if it has not been processed, 
     * and 1.0 if it has. 
     * 
     * @return 0.0 if the file has not been processed. 1.0 if it has. 
     * @throws IOException never. 
     * @throws InterruptedException never. 
     */ 
     @Override 
     public float getProgress() throws IOException, InterruptedException { 
     return (mProcessed) ? (float) 1.0 : (float) 0.0; 
     } 

     /** 
     * All of the internal state is already set on instantiation. This is a no-op. 
     * 
     * @param split The InputSplit to read. Unused. 
     * @param context The context for this task. Unused. 
     * @throws IOException never. 
     * @throws InterruptedException never. 
     */ 
     @Override 
     public void initialize(InputSplit split, TaskAttemptContext context) 
      throws IOException, InterruptedException { 
     // no-op. 
     } 

     /** 
     * <p>If the file has not already been read, this reads it into memory, so that a call 
     * to getCurrentValue() will return the entire contents of this file as Text, 
     * and getCurrentKey() will return the qualified path to this file as Text. Then, returns 
     * true. If it has already been read, then returns false without updating any internal state.</p> 
     * 
     * @return Whether the file was read or not. 
     * @throws IOException if there is an error reading the file. 
     * @throws InterruptedException if there is an error. 
     */ 
     @Override 
     public boolean nextKeyValue() throws IOException, InterruptedException { 
     if (!mProcessed) { 
      if (mFileLength > (long) Integer.MAX_VALUE) { 
      throw new IOException("File is longer than Integer.MAX_VALUE."); 
      } 
      byte[] contents = new byte[(int) mFileLength]; 

      FileSystem fs = mFileToRead.getFileSystem(mConf); 
      FSDataInputStream in = null; 
      try { 
      // Set the contents of this file. 
      in = fs.open(mFileToRead); 
      IOUtils.readFully(in, contents, 0, contents.length); 
      mFileText.set(contents, 0, contents.length); 

      } finally { 
      IOUtils.closeStream(in); 
      } 
      mProcessed = true; 
      return true; 
     } 
     return false; 
     } 

}

Di seguito è riportato il codice del driver: -

public int run(String[] arg) throws Exception { 
    Configuration conf=getConf(); 
    FileSystem fs = FileSystem.get(conf); 
    //estimate reducers 
    Job job = new Job(conf); 
    job.setJarByClass(WholeFileDriver.class); 
    job.setJobName("WholeFile"); 
    job.setOutputKeyClass(Text.class); 
    job.setOutputValueClass(Text.class); 
    job.setInputFormatClass(WholeFileInputFormat.class); 
    job.setMapperClass(WholeFileMapper.class); 
    job.setNumReduceTasks(0); 

    FileInputFormat.addInputPath(job, new Path(arg[0])); 
    Path output=new Path(arg[1]); 
    try { 
     fs.delete(output, true); 
    } catch (IOException e) { 
     LOG.warn("Failed to delete temporary path", e); 
    } 
    FileOutputFormat.setOutputPath(job, output); 

    boolean ret=job.waitForCompletion(true); 
    if(!ret){ 
     throw new Exception("Job Failed"); 
    }

fonte

2013-07-26 12:14:47 Binary01

grazie sig. binario ... non avevo aggiunto job.setInputFormatClass (WholeFileInputFormat.class). Mi stavo chiedendo anche quando avevo aggiunto RawCombineFileInputFormat.addInputPath (job, new Path (args [0])); perché il tipo di formato non è stato rilevato. Qualsiasi ans? –

Lettura del file come record singolo in hadoop

risposta

Problemi correlati