How can I speed up extracting the required keys from many files and writing them to a CSV file?

Asked 2 years ago, Updated 2 years ago, 79 views

I extract the necessary keys from multiple files and export the list to a single CSV file.

  • The original data is about 180,000 files (roughly 40 kB each), about 8 GB in total
  • After extracting the required data, the output file is about 10 MB

I confirmed that the code works, but it takes far too long: the Perl version completes in about 240 seconds, while the C# version takes about an hour. I have only recently started studying C#, so I don't know whether there is wasted processing or a strange loop somewhere in the code.

Please let me know if there is a way to make this process faster.
Thank you in advance.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace ConsoleApp6
{
    class DatRowValues
    {
        public string ProcessData { get; set; }
        public string KeyValue { get; set; }
        public string IntValue { get; set; }
        public string StringValue { get; set; }
        public string Value { get; set; }
    }
    class NewDatRowValues
    {
        public string EqpId { get; set; }
        public string LotId { get; set; }
        public string WaferId { get; set; }
        public DateTime SDate { get; set; }
        public string TempBRINECoolant { get; set; }
    }
    class NewDatRowValuesMapper : CsvHelper.Configuration.ClassMap<NewDatRowValues>
    {
        public NewDatRowValuesMapper()
        {
            Map(x => x.EqpId).Index(0);
            Map(x => x.LotId).Index(1);
            Map(x => x.WaferId).Index(2);
            Map(x => x.SDate).Index(3).TypeConverterOption.Format("yyyy/MM/dd HH:mm:ss");
            Map(x => x.TempBRINECoolant).Index(4);
        }
    }
    class Program
    {
        static void Main(string[] args)
        {
            // Container for the rows to be written.
            var writeDatList = new List<NewDatRowValues>();

            // Loading
            foreach (string fileName in Directory.GetFiles(@"C:\20190403", "*.dat"))
                using (var sr = new StreamReader(fileName, System.Text.Encoding.GetEncoding("shift_jis")))
                using (var inputDat = new CsvHelper.CsvReader(sr))
                {
                    inputDat.Configuration.HasHeaderRecord = false;

                    // Extract only the lines with the required keys
                    var dat = inputDat.GetRecords<DatRowValues>();
                    var targetRows = dat.Where(r =>
                        r.KeyValue == "EQP_ID" ||
                        r.KeyValue == "LOT_ID" ||
                        r.KeyValue == "WAFER_ID" ||
                        r.KeyValue == "S_DATE" ||
                        r.KeyValue == "TempBRINECoolant");

                    // Store each value
                    var newRow = new NewDatRowValues();
                    foreach (var row in targetRows)
                    {
                        if (row.KeyValue == "EQP_ID")
                        {
                            newRow.EqpId = row.StringValue;
                        }
                        if (row.KeyValue == "LOT_ID")
                        {
                            newRow.LotId = row.StringValue;
                        }
                        if (row.KeyValue == "WAFER_ID")
                        {
                            newRow.WaferId = row.StringValue;
                        }
                        if (row.KeyValue == "S_DATE")
                        {
                            newRow.SDate = DateTime.Parse(row.StringValue);
                        }
                        if (row.KeyValue == "TempBRINECoolant")
                        {
                            newRow.TempBRINECoolant = row.StringValue;
                        }
                    }
                    writeDatList.Add(newRow);
                }

            // Writing
            using (var sw = new StreamWriter(@"C:\test\list1.csv"))
            using (var outputDat = new CsvHelper.CsvWriter(sw))
            {
                var writingList = writeDatList.GroupBy(r => r.EqpId.Substring(0, 4))
                    .Where(g => g.Count() > 1)
                    .SelectMany(g => g)
                    .ToList();
                outputDat.Configuration.HasHeaderRecord = false;
                outputDat.Configuration.RegisterClassMap<NewDatRowValuesMapper>();
                outputDat.WriteRecords(writingList);
            }
        }
    }
}

Original data (contents of one file)

 \\, AACZ12501_93G25701901, 93G257019-18, TSN.PR, TSN-LCT, AACZ12501, April 4, 2019 00:00:31, Actual Processing Data
ProcessData, LOT_ID, 3, AP0077130.00C,
ProcessData, LOT_ID_SUB, 3, AP0077130.00,
ProcessData, LOT_NO, 3, AP0077130,
ProcessData, WAFER_ID, 3, AP0077130.18,
ProcessData, WAFER_NO, 1,18,
ProcessData, PRODSPEC_ID, 3, T6BD60001-00001.00,
ProcessData, PRODGRP_ID, 3, T6BD6,
ProcessData, PRODGRP_BIND, 3, T6BD6,
ProcessData, MAIN_MAINPD_ID, 3,A5L501PC.00,
ProcessData, MAINPD_ID, 3,A5L501PC.00,
ProcessData, FLOW_TYPE, 3, Main,
ProcessData, FLOW_TYPE_NO, 1,1,
ProcessData, D_SEQNO, 1,89,
ProcessData, OP_NO, 3, TSNCT Coat.MA1,
ProcessData, OP_NO_NAME, 3, Processing,
ProcessData, PD_IDENT, 3, KTSNIMA 1.00,
ProcessData, PD_IDENT_NAME, 3, COAT,
ProcessData, EQP_GROUP_CODE, 3, PCOT,
ProcessData, EQP_GROUP_NAME, 3, RESIST C/T,
ProcessData, EQP_GROUP_BIND, 3, PCOT,
ProcessData, EQP_ID, 3, PCOT003,
ProcessData, PH_RECIPE_ID, 3084,
ProcessData, RCP_NAME_SPACE, 3, PEPPR,
ProcessData, LC_RECIPE_ID, 3, V146G-420-10 + AQ7.00,
ProcessData, RECIPE_ID, 3, PEPPR.084,
ProcessData, S_DATE, 4, April 3, 2019 23:48:08,
ProcessData, E_DATE, 4, April 4, 2019 00:00:31,
ProcessData, CAST_ID, 3, PA0-01239,
ProcessData, SLOT_NO, 1,18,
ProcessData, DEPT_CODE, 3, DEPT,
ProcessData, HIST_S_DATE_1,4, April 3, 2019 23:48:08,
ProcessData, HIST_E_DATE_1,4, April 4, 2019 00:00:31,
ProcessData, Clock_C, 3, 2019040400003155,
ProcessData, EventName_C, 3, STS At Destination,
ProcessData, SubstID_C, 3, AP0077130.18,
ProcessData, ProcessJobID_C, 3, AP0077130.01,
ProcessData, PPID_C, 3, RegFlowRcpClass/084,
ProcessData, ControlJobID_C, 3, PCOT003-20190403-0053,
ProcessData, WaferSequenceNo_C, 3,18,
ProcessData, SubstProcState_C, 3, 2,
ProcessData, _TCT-02_Cuptemp., 2,23.07,
ProcessData, _TCT-02_Cup humidity, 2,45.26,
ProcessData, _TCT-02_Resist temp., 2,23.00,
ProcessData, _TCT-02_Motor flange temp., 2,23.00,
ProcessData, _TCT-02_Solvent bath flow, 2,0.0,
ProcessData, _TCT-02_Backrince flow 1, 2, 0.0,
ProcessData, _TCT-02_Backrince flow 2, 2, 0.0,
ProcessData, _TCT-02_Drain case Rinse, 2.0,
ProcessData, _TCT-02_Backrace1+2 flow, 2,64.8,
ProcessData, _TCT-02_Sideline flow, 2, 4.8,
ProcessData, _TCT-02_Cup window velocity, 2,0.38,
ProcessData, _COT-02_Cuptemp., 2,23.08,
ProcessData, _COT-02_Cup humidity, 2,45.31,
ProcessData, _COT-02_Resist temp., 2,23.02,
ProcessData, _COT-02_Motor flange temp., 2,23.00,
ProcessData, _COT-02_Solvent bath flow, 2,0.0,
ProcessData, _COT-02_RRC Nozzle flow, 2, 2.5,
ProcessData, _COT-02_Drain case Rinse, 2.0,
ProcessData, _COT-02_Backrace1+2flow, 2,81.7,
ProcessData, _COT-02_Sideline flow, 2,21.4,
ProcessData, _COT-02_Cup window velocity, 20.40,
ProcessData, _ADH-02_Plate temp., 2100.03,
ProcessData, _ADH-02_HMDS flow, 2,5553.9,
ProcessData, _CPL-03_Plate temp., 2,24.00,
ProcessData, _CPL-05_Plate temp., 2,23.00,
ProcessData, _PHP-03_Plate temp., 2,109.99,
ProcessData, _PHP-03_Plate temp.1,2,109.98,
ProcessData, _PHP-03_Plate temp.2, 2,110.02,
ProcessData, _PHP-03_Plate temp.3, 2,110.02,
ProcessData, _PHP-03_Plate temp.4, 2,110.01,
ProcessData, _PHP-03_Plate temp.5, 2,109.99,
ProcessData, _PHP-03_Plate temp.6, 2,109.98,
ProcessData, _PHP-03_Plate temp.7, 2,109.99,
ProcessData, SlotStatus_C, 3, 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 000,
ProcessData, PortID_C, 3, 1,
ProcessData, LotID_C, 3, AP0077130.00C,
ProcessData, SubstDestination_C, 3, PA0-01239.18,
ProcessData, SubstLocID1_C, 3, PA0-01239.18,
ProcessData, Timein1_C, 3, 2019040323394290,
ProcessData, Timeout1_C, 3, 2019040323480827,
ProcessData, SubstLocID2_C, 3, [2-05] TRS01,
ProcessData, Timein2_C, 3, 2019040323481433,
ProcessData, Timeout2_C, 3, 2019040323482607,
ProcessData, SubstLocID3_C, 3, [2-15]ADH02,
ProcessData, Timein3_C, 3, 2019040323482957,
ProcessData, Timeout 3_C, 3, 2019040323493000,
ProcessData, SubstLocID4_C, 3, [2-21]CPL05,
ProcessData, Timein4_C, 3, 2019040323493567,
ProcessData, Timeout 4_C, 3, 2019040323503524,
ProcessData, SubstLocID5_C, 3, [2-02]COT02,
ProcessData, Timein5_C, 3,2019040323504071,
ProcessData, Timeout 5_C, 3, 2019040323514101,
ProcessData, SubstLocID6_C, 3, [2-24] PHP03,
ProcessData, Timein6_C, 3, 2019040323514584,
ProcessData, Timeout 6_C, 3, 2019040323533013,
ProcessData, SubstLocID7_C, 3, [2-17] CPL03,
ProcessData, Timein7_C, 3, 2019040323533561,
ProcessData, Timeout 7_C, 3, 2019040323563468,
ProcessData, SubstLocID8_C, 3, [2-04] TCT02,
ProcessData, Timein8_C, 3, 2019040323564008,
ProcessData, Timeout8_C, 3,2019040400001710,
ProcessData, SubstLocID9_C, 3, [2-06] TRS02,
ProcessData, Timein9_C, 3,2019040400002061,
ProcessData, Timeout 9_C, 3, 2019040400002521,
ProcessData, SubstMtrlStatus_C, 3, 0,
ProcessData, SubstSource_C, 3, PA0-01239.18,
ProcessData, SubstState_C, 3, 2,
ProcessData, SubstType_C, 3,0,
ProcessData, SubstUsage_C, 3,0,
ProcessData, CLOCK1_C, 3, 2019040323480827,
ProcessData, CLOCK2_C, 3, 2019040400003153,

Part of the output CSV file looks like this:

PCOT003 AP0077130.00C AP0077130.18 2019/4/3 23:48
PCOT004 AP0077164.00C AP0077164.16 2019/4/3 23:49
PCOT004 AP0077164.00C AP0077164.17 2019/4/3 23:50
PCOT008 AP00767.00C AP00766.01 2019/4/3 23:56
PCOT001 SP0008774.00C SP0008774.02 2019/4/3 23:50
PCOT002 SP0009131.00C SP0009131.03 2019/4/3 23:53
PCOT002 SP0009131.00C SP0009131.02 2019/4/3 23:53
PCOT001 SP0008774.00C SP0008774.03 2019/4/3 23:50
PCOT008 AP00767.00C AP00766.03 2019/4/3 23:56
PCOT008 AP00767.00C AP0076967.02 2019/4/3 23:56
PCOT004 AP0077164.00C AP0077164.18 2019/4/3 23:50
PCOT002 SP0009131.00C SP0009131.04 2019/4/3 23:54
PCOT008 AP00767.00C AP00766.04 2019/4/3 23:56
PCOT004 AP0077164.00C AP0077164.20 2019/4/3 23:52
PCOT004 AP0077164.00C AP0077164.19 2019/4/3 23:51
PCOT003 AP0077130.00C AP0077130.19 2019/4/3 23:48
PCOT002 SP0009131.00C SP0009131.06 2019/4/3 23:55
PCOT002 SP0009131.00C SP0009131.05 2019/4/3 23:54
PCOT001 SP0008774.00C SP0008774.05 2019/4/3 23:50

One line is output per input file.

I was given the Perl code below, so I am including it for reference only.

use strict;
use warnings;


my $dirname = 'C:\Users\0020316094\Desktop\Perl\development_1\data';
my @list;
my $count = 0;
my @Lot;
my @Waf;
my @Eqp;
my @Date;
my @lot_id;
my @waf_id;
my @eqp_id;
my @date_id;

my $start_time = time;

opendir(DIR, $dirname) or die "$dirname: $!";
while (my $dir = readdir(DIR)) {
    next if $dir eq '.' || $dir eq '..' || $dir eq 'test.txt' || $dir eq 'file_get.pl';  # exclude . and .. etc.
    push @list, $dir;  # put into the array
}
closedir(DIR);


foreach (@list) {

#   open(FILE, "<", "C:\Users\0020316094\Desktop\Perl\development_1\data\$list[$count]") or die "$!";
    open(FILE, "<", "$list[$count]") or die "$!";  # before modification

    while (my $line = <FILE>) {

        @lot_id = split(/,/, $line) if $line =~ /,LOT_ID,/;
        $Lot[$count] = $lot_id[3];
        @waf_id = split(/,/, $line) if $line =~ /,WAFER_ID,/;
        $Waf[$count] = $waf_id[3];
        @eqp_id = split(/,/, $line) if $line =~ /,EQP_ID,/;
        $Eqp[$count] = $eqp_id[3];
        @date_id = split(/,/, $line) if $line =~ /,S_DATE,/;
        $Date[$count] = $date_id[3];
    }
    close(FILE);
    $count++;
}
my $count1 = 0;
open(TXT, ">>test.txt") or die "$!";
foreach (@Lot) {
    print TXT "$Lot[$count1], $Waf[$count1], $Eqp[$count1], $Date[$count1]\n";
    $count1++;
}
close(TXT);

c#

2022-09-30 21:39

4 Answers

You have about 180,000 files to load, roughly 8 GB in total, so
how long does simply loading them take?

How long does it take to just read everything and do nothing else?
One or two minutes? About 50 minutes? If that alone takes a long time,
the file I/O is what dominates, and the extraction part
cannot be improved much.

First of all, shouldn't you find out where the time is actually being spent?


2022-09-30 21:39

First of all, it would be better to find out where the processing load is.
In this case, either the file loading itself is heavy, or CsvHelper is heavy.

First, simply open each file with a StreamReader, read it with ReadToEnd(), and discard the result, to measure the cost of reading alone. If this by itself takes nearly an hour, it will be hard to improve much given how .NET file access works; what happens if you run it in parallel?
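
As a rough idea of what that measurement could look like, here is a minimal sketch; the folder path and Shift-JIS encoding are taken from the question's code, and the timing uses Stopwatch. Adjust the paths as needed.

using System;
using System.Diagnostics;
using System.IO;
using System.Text;

class ReadOnlyBenchmark
{
    static void Main()
    {
        // Read every .dat file once and discard the contents,
        // so the elapsed time reflects file I/O alone.
        var enc = Encoding.GetEncoding("shift_jis");
        var watch = Stopwatch.StartNew();
        long totalChars = 0;
        foreach (var fileName in Directory.EnumerateFiles(@"C:\20190403", "*.dat"))
        {
            using (var sr = new StreamReader(fileName, enc))
            {
                totalChars += sr.ReadToEnd().Length;
            }
        }
        watch.Stop();
        Console.WriteLine($"Read {totalChars} characters in {watch.Elapsed}");
    }
}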

If that test shows the reading itself is not the problem, then CsvHelper may be the heavy part. This appears to be a fairly simple CSV format, so you can expect it to be faster if you implement the string handling yourself.

In Visual Studio, the diagnostic tools can break the load down per function, so they may help with a more detailed investigation.


2022-09-30 21:39

I wrote parsing code equivalent to the Perl version.

using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace ConsoleApp6 {
    class Output {
        public string LotId { get; set; }
        public string WaferId { get; set; }
        public string EqpId { get; set; }
        public string SDate { get; set; }
    }

    class Program {
        static Output Convert(string fileName) {
            var output = new Output();
            foreach (var line in File.ReadLines(fileName, Encoding.Default)) {
                if (Regex.IsMatch(line, ",LOT_ID,"))
                    output.LotId = line.Split(',')[3];
                if (Regex.IsMatch(line, ",WAFER_ID,"))
                    output.WaferId = line.Split(',')[3];
                if (Regex.IsMatch(line, ",EQP_ID,"))
                    output.EqpId = line.Split(',')[3];
                if (Regex.IsMatch(line, ",S_DATE,"))
                    output.SDate = line.Split(',')[3];
            }
            return output;
        }
        static IEnumerable<Output> Read() {
            return Directory.EnumerateFiles(@"C:\20190403", "*.dat")
                // If you uncomment the next line, the files are processed in parallel on multiple threads.
                // .AsParallel()
                .Select(fileName => Convert(fileName));
        }
        static void Main() {
            using (var sw = new StreamWriter(@"C:\test\list1.csv")) {
                foreach (var output in Read())
                    sw.WriteLine($"{output.LotId}, {output.WaferId}, {output.EqpId}, {output.SDate}");
            }
        }
    }
}
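
If the per-file parsing turns out to be CPU-bound rather than limited by disk I/O, the commented-out parallel variant could be enabled roughly like this. This is only a sketch: the AsOrdered() call is my addition (not in the answer above) to keep the output in file-enumeration order, and it can be dropped if the order does not matter.

        static IEnumerable<Output> Read() {
            return Directory.EnumerateFiles(@"C:\20190403", "*.dat")
                .AsParallel()  // parse files on multiple threads (PLINQ)
                .AsOrdered()   // optional: preserve the enumeration order of the files
                .Select(fileName => Convert(fileName));
        }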

I don't know whether it will actually get faster, but I reworked it as far as I could think of. The comments are written as part of the answer rather than as production code comments.

using CsvHelper;
using CsvHelper.Configuration.Attributes;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;

namespace ConsoleApp6 {
    // Unnecessary columns are read and simply discarded.
    class Input {
        [Index(1)]
        public string Key { get; set; }
        [Index(3)]
        public string Value { get; set; }
    }
    class Output {
        public string EqpId { get; set; }
        public string LotId { get; set; }
        public string WaferId { get; set; }
        // The date is kept as a string and not parsed.
        public string SDate { get; set; }
        public string TempBRINECoolant { get; set; }
    }

    class Program {
        static IEnumerable<Output> Read() {
            return Directory.EnumerateFiles(@"C:\20190403", "*.dat")
                // If you uncomment the next line, the files are processed in parallel on multiple threads.
                // .AsParallel()
                .Select(fileName => {
                    using (var sr = new StreamReader(fileName, Encoding.Default))
                    using (var reader = new CsvReader(sr)) {
                        reader.Configuration.HasHeaderRecord = false;
                        var output = new Output();
                        // The pre-filtering with Where was removed because it only doubled the work.
                        foreach (var row in reader.GetRecords<Input>()) {
                            // The number of comparisons kept growing because the original did not use else if.
                            // A switch statement is used here instead.
                            switch (row.Key) {
                                case "EQP_ID":
                                    output.EqpId = row.Value;
                                    break;
                                case "LOT_ID":
                                    output.LotId = row.Value;
                                    break;
                                case "WAFER_ID":
                                    output.WaferId = row.Value;
                                    break;
                                case "S_DATE":
                                    output.SDate = row.Value;
                                    break;
                                case "TempBRINECoolant":
                                    output.TempBRINECoolant = row.Value;
                                    break;
                            }
                        }
                        return output;
                    }
                });
        }

        static void Main() {
            using (var sw = new StreamWriter(@"C:\test\list1.csv"))
            using (var writer = new CsvWriter(sw)) {
                writer.Configuration.HasHeaderRecord = false;
                var rows = Read()
                    .GroupBy(r => r.EqpId.Substring(0, 4))
                    .Where(g => g.Count() > 1)
                    .SelectMany(g => g);
                writer.WriteRecords(rows);
            }
        }
    }
}

What do you think?


2022-09-30 21:39

Incidentally, although CsvReader is not covered there, the article below compares several file-reading approaches and their speeds.
[C#] File read speed comparison

This may be superfluous, but if you modify @sayuri's Perl-equivalent static Output Convert() as follows, it might be a little faster. (It breaks out of the loop partway through the file, so it could also have a negative effect.)

private static readonly Encoding Enc = Encoding.GetEncoding("shift_jis");

static Output Convert(string fileName)
{
    var output = new Output();
    int dataflags = 0;
    foreach (var line in File.ReadLines(fileName, Enc))
    {
        string[] rec = line.Split(',');
        string value = rec[3];
        switch (rec[1])
        {
            case "EQP_ID":
                output.EqpId = value;
                dataflags |= 0x01;
                break;
            case "LOT_ID":
                output.LotId = value;
                dataflags |= 0x02;
                break;
            case "WAFER_ID":
                output.WaferId = value;
                dataflags |= 0x04;
                break;
            case "S_DATE":
                output.SDate = value.Substring(0, 16);
                dataflags |= 0x08;
                break;
        }
        // Stop reading the file as soon as all four keys have been found.
        if (dataflags == 0x0F)
        {
            break;
        }
    }
    return output;
}


2022-09-30 21:39
