Merge all lines that are identical aside from a ke

2020-05-11 02:34发布

host1 input nic1 ip1 ip2 PROT 30000 10
host1 input nic1 ip1 ip2 PROT 40000 10
host1 input nic1 ip1 ip2 PROT 50000 10
host1 input nic1 ip1 ip2 PROT 60000 10
host1 input nic1 ip3 ip2 PROT 10 30000
host1 input nic1 ip3 ip2 PROT 10 40000
host1 input nic1 ip3 ip2 PROT 10 50000
host1 input nic1 ip3 ip2 PROT 10 60000
host1 output nic1 ip2 ip1 PROT 10 30000
host1 output nic1 ip2 ip1 PROT 10 40000
host1 output nic1 ip2 ip1 PROT 10 50000
host1 output nic1 ip2 ip1 PROT 10 60000
host1 output nic1 ip2 ip3 PROT 30000 10
host1 output nic1 ip2 ip3 PROT 40000 10
host1 output nic1 ip2 ip3 PROT 50000 10
host1 output nic1 ip2 ip3 PROT 60000 10
host1 output loc ip2 ip2 PROT 10 30000
host1 output loc ip2 ip2 PROT 10 50000

host1 input nic1 ip1 ip2 PROT 30000:60000 10
host1 input nic1 ip3 ip2 PROT 10 30000:60000
host1 output nic1 ip2 ip1 PROT 10 30000:60000
host1 output nic1 ip2 ip3 PROT 30000:60000 10
host1 output loc ip2 ip2 PROT 10 30000:50000

Update

I have refactored the code in the answer below to make it more readable. The main body should now read almost like English prose.

#!/usr/bin/awk -f
# main body
# Main body.
#
# Lines are merged group by group. A group is a run of consecutive records
# for which inSameGroup() is true, i.e. every field except the last two
# equals the group's first record. Two global arrays hold the state:
#   veryold - the first record of the current group
#   old     - the most recent record of the current group
# Members of a group must be adjacent in the input, because each record is
# compared only against veryold.

# The first record opens the first group. It is both the first and (so far)
# the last record of that group, so it is stored in veryold AND old;
# otherwise a group made of a single line would be printed from an empty or
# stale old array.
NR == 1 {
  copyRecordTo(veryold)
  copyRecordTo(old)
  next
}
{
  if (inSameGroup()) {
    # Still inside the current group: remember this record as its last one.
    copyRecordTo(old)
  } else {
    # The previous group is complete: turn its last two fields into
    # first:last ranges where they changed, and print it.
    makeRangeForField(NF - 1)
    makeRangeForField(NF)
    nicePrint()
    # The current record opens the next group (first and last record so far).
    copyRecordTo(veryold)
    copyRecordTo(old)
  }
}
END {
  # Flush the final group. Skipped on empty input, where there is nothing to
  # print. Relies on NF still holding the last record's field count in END.
  if (NR > 0) {
    makeRangeForField(NF - 1)
    makeRangeForField(NF)
    nicePrint()
  }
}

# functions
# Copy every field of the current record ($1 .. $NF) into the array `line`,
# indexed by field number. Elements of `line` beyond NF are left untouched.
#
# line - array that receives the fields (arrays are passed by reference)
# i    - local loop index; declared as an extra parameter so the loop does
#        not overwrite a global variable named i (callers pass one argument)
function copyRecordTo(line,    i) {
  for (i = 1; i <= NF; ++i) line[i] = $i
}
# Print the record stored in the global array `old` as one output line.
# Fields 1 .. NF (the current record's field count) are each followed by a
# tab; the next-to-last field gets a second tab so that the last two columns
# stand apart. The line is terminated by a newline.
#
# i, fmt - locals, declared as extra parameters so that printing does not
#          overwrite global variables of the same name (call with no
#          arguments)
function nicePrint(    i, fmt) {
  for (i = 1; i <= NF; ++i) {
    fmt = (i == NF - 1) ? "%s\t\t" : "%s\t"
    printf(fmt, old[i])
  }
  printf("\n")
}
# Turn field f of the pending record into a "first:last" range.
# If the value stored in the global array `old` equals the one in the global
# array `veryold`, the field is left alone; otherwise old[f] is replaced by
# veryold[f], a colon, and the previous old[f].
#
# f - index of the field to process
function makeRangeForField(f) {
  if (old[f] == veryold[f])
    return
  old[f] = veryold[f] ":" old[f]
}
# Report whether the current record belongs to the group whose first record
# is stored in the global array `veryold`: returns 1 when fields 1 .. NF-2 of
# the current record all equal the corresponding veryold entries, 0 otherwise.
# The last two fields are deliberately not compared. With two fields or
# fewer there is nothing to compare and the result is 1.
#
# i - local loop index; declared as an extra parameter so the check does not
#     overwrite a global variable named i (call with no arguments)
function inSameGroup(    i) {
  for (i = 1; i <= NF - 2; ++i)
    if ($i != veryold[i])
      return 0  # first mismatch settles it
  return 1
}

Original answer

The following awk script generates almost what you are looking for.

Essentially the script does the following:

stores in veryold the first line of each set of lines that differ only in the 7th and/or 8th field
stores in old the last read line
the "boolean" b is used to detect when the current line no longer belongs to that set, i.e. when the last line of the set has been passed
when this happens the last two fields of veryold are joined with those of old with a : in between if they are different, and old is printed
one more tab \t is used between the last two fields to improve readability

Two other points:

NR == 1 is a special case that has to initialize veryold only
after the last line is read, END handles the special case of the final set, whose last line is still stored in old

#!/usr/bin/awk -f
# Merge consecutive lines that are identical except for their last two
# fields. State is kept in two global arrays:
#   veryold - the first line of the current set
#   old     - the most recent line of the current set
# Lines of one set must be adjacent, since each line is compared only with
# veryold.

# The first line opens the first set. It is stored in both veryold and old so
# that a set consisting of a single line is printed correctly.
NR == 1 {
  for (i = 1; i <= NF; ++i) {
    veryold[i] = $i
    old[i] = $i
  }
  next
}
{
  # b stays 1 only while every field except the last two matches veryold.
  # Field 1 is included, so lines that differ in their first field are never
  # merged.
  b = 1
  for (i = 1; i <= NF - 2; ++i) {
    b *= ($i == veryold[i])
  }
  if (b == 1) {
    # Same set: this line becomes the set's last line so far.
    for (i = 1; i <= NF; ++i) {
      old[i] = $i
    }
  } else {
    # The previous set is complete: join first and last value of each of the
    # two trailing fields with ":" where they differ, then print the line.
    if (old[NF - 1] != veryold[NF - 1]) {
      old[NF - 1] = veryold[NF - 1]":"old[NF - 1]
    }
    if (old[NF] != veryold[NF]) {
      old[NF] = veryold[NF]":"old[NF]
    }
    for (i = 1; i <= NF; ++i) {
      # An extra tab separates the last two fields for readability.
      if (i == NF - 1) {
        fmt = "%s\t\t"
      } else {
        fmt = "%s\t"
      }
      printf(fmt, old[i])
    }
    printf("\n")
    # The current line opens the next set (first and last line so far).
    for (i = 1; i <= NF; ++i) {
      veryold[i] = $i
      old[i] = $i
    }
  }
}
END {
  # Flush the final set; nothing is printed for empty input. Relies on NF
  # still holding the last line's field count in END.
  if (NR > 0) {
    if (old[NF - 1] != veryold[NF - 1]) {
      old[NF - 1] = veryold[NF - 1]":"old[NF - 1]
    }
    if (old[NF] != veryold[NF]) {
      old[NF] = veryold[NF]":"old[NF]
    }
    for (i = 1; i <= NF; ++i) {
      if (i == NF - 1) {
        fmt = "%s\t\t"
      } else {
        fmt = "%s\t"
      }
      printf(fmt, old[i])
    }
    # Terminate the last output line like every other one.
    printf("\n")
  }
}