Hopefully someone will help you with an answer that uses an XML-aware tool, but if not, and assuming your input really does look like the sample you provided, here is a solution using GNU awk for `sorted_in`:
$ cat tst.awk
# Print blank-line-separated <item> records ordered by their <date>,
# newest first. Requires GNU awk for PROCINFO["sorted_in"].
#
# Input : paragraph-mode records, each containing one
#         <date>MonthName D, YYYY</date> element.
# Output: the same records, separated by one blank line, sorted by date
#         descending; records sharing a date keep their input order.
BEGIN { RS=""; ORS="\n\n"; FS="</?date>" }
{
    # $2 is the text between <date> and </date>, e.g. "August 24, 2021"
    split($2,d,/[, ]+/)
    mthAbbr = substr(d[1],1,3)
    # Every abbreviation is exactly 3 chars wide, so its 1-based offset
    # in the lookup string (1,4,7,...) maps onto month numbers 1..12.
    mthNr = ( index( "JanFebMarAprMayJunJulAugSepOctNovDec", mthAbbr ) + 2 ) / 3
    date = sprintf("%04d%02d%02d",d[3], mthNr, d[2])
    # Append instead of overwrite so items with the same date are not lost.
    items[date] = ( date in items ? items[date] ORS : "" ) $0
}
END {
    # Visit the YYYYMMDD keys in descending numeric order.
    PROCINFO["sorted_in"] = "@ind_num_desc"
    for ( date in items ) {
        print items[date]
    }
}
$ awk -f tst.awk file
<item>
<date>August 24, 2021</date>
<p>Text</p>
</item>
<item>
<date>July 20, 2021</date>
<p>some text</p>
</item>
<item>
<date>February 11, 2020</date>
<p>more text</p>
</item>
or using any awk plus sort and cut:
$ cat tst.awk
# Decorate every line of each blank-line-separated <item> record with
# "YYYYMMDD <TAB> record-number <TAB> line-number <TAB>" so that an external
# `sort` can order items by date and `cut -f4-` can strip the prefix again.
# Works in any POSIX awk.
#
# Assumes the <date>MonthName D, YYYY</date> element is the 2nd line of
# each record, as in the sample input.
BEGIN { RS=""; FS="\n"; OFS="\t" }
{
    # Splitting "<date>August 24, 2021</date>" on runs of [<>, ] yields
    # d[3]=month name, d[4]=day, d[5]=year (d[1] is the empty lead-in).
    split($2,d,/[<>, ]+/)
    mthAbbr = substr(d[3],1,3)
    # Every abbreviation is exactly 3 chars wide, so its 1-based offset
    # in the lookup string (1,4,7,...) maps onto month numbers 1..12.
    mthNr = ( index( "JanFebMarAprMayJunJulAugSepOctNovDec", mthAbbr ) + 2 ) / 3
    date = sprintf("%04d%02d%02d",d[5], mthNr, d[4])
    for (i=1; i<=NF; i++) {
        print date, NR, i, $i
    }
    # Emit an empty line (i is now NF+1) to restore the blank record
    # separator after sort and cut have run.
    print date, NR, i, ""
}
$ awk -f tst.awk file | sort -k1,1rn -k2,3n | cut -f4-
<item>
<date>August 24, 2021</date>
<p>Text</p>
</item>
<item>
<date>July 20, 2021</date>
<p>some text</p>
</item>
<item>
<date>February 11, 2020</date>
<p>more text</p>
</item>
The 2nd one will be a better choice if your input file is huge, since it doesn't require awk to hold the whole input file in memory before printing it. It works by decorating the input lines: each line is prefixed with the date of its `item`, followed by the current record (`item`) number, followed by the current line number within that `item`, so that `sort` can then sort by `date` but retain the original input order even for duplicate dates, and then `cut` just removes the decorations that the first `awk` added to facilitate sorting. Here's what the output from the first 2 steps looks like so you can see what they do:
$ awk -f tst.awk file
20210824 1 1 <item>
20210824 1 2 <date>August 24, 2021</date>
20210824 1 3 <p>Text</p>
20210824 1 4 </item>
20210824 1 5
20200211 2 1 <item>
20200211 2 2 <date>February 11, 2020</date>
20200211 2 3 <p>more text</p>
20200211 2 4 </item>
20200211 2 5
20210720 3 1 <item>
20210720 3 2 <date>July 20, 2021</date>
20210720 3 3 <p>some text</p>
20210720 3 4 </item>
20210720 3 5
$ awk -f tst.awk file | sort -k1,1rn -k2,3n
20210824 1 1 <item>
20210824 1 2 <date>August 24, 2021</date>
20210824 1 3 <p>Text</p>
20210824 1 4 </item>
20210824 1 5
20210720 3 1 <item>
20210720 3 2 <date>July 20, 2021</date>
20210720 3 3 <p>some text</p>
20210720 3 4 </item>
20210720 3 5
20200211 2 1 <item>
20200211 2 2 <date>February 11, 2020</date>
20200211 2 3 <p>more text</p>
20200211 2 4 </item>
20200211 2 5
`awk` or `sed`, and dedicated parsers such as `xmlstarlet` should be used. Please indicate what you already tried and where you faced problems, so that contributors can help you find the solution to a specific question along the path. – AdminBee Aug 26 '21 at 10:17