Skip to content

Commit 7560dec

Browse files
+ PAC: improve the count of max characters per line
Handles non-printable characters, italic markers, and a mix of 7-bit and 16-bit characters
1 parent 017d0fa commit 7560dec

File tree

1 file changed

+147
-92
lines changed

1 file changed

+147
-92
lines changed

Source/MediaInfo/Text/File_Pac.cpp

+147-92
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,15 @@ void Pac_Convert(Ztring& ToConvert, const codepage& CodePage) {
5454
}
5555
}
5656

57+
//---------------------------------------------------------------------------
58+
enum Pac_format { // How the current/next part of a PAC text line is handled by File_Pac::Data_Parse()
59+
Format_8bit, // Default: part is read with Get_ISO_8859_1()
60+
Format_Skip3, // Selected on 0x1F 'C' '0'..'9': the 3-byte sequence is skipped, the part itself adds no text
61+
Format_Skip4, // Selected on 0x1F 'R' '.' '0'..'9': the 4-byte sequence is skipped, the part itself adds no text
62+
Format_Utf8, // Selected on 0x1F 0xEF 0xBB 0xBF: part is read with Get_UTF8() up to a 0x2E or 0xFF byte
63+
Format_16bit, // Selected on 0x1F "W16.": part is scanned and consumed 2 bytes at a time
64+
};
65+
5766
//***************************************************************************
5867
// Constructor/Destructor
5968
//***************************************************************************
@@ -253,107 +262,151 @@ void File_Pac::Data_Parse()
253262
}
254263
}
255264
ZtringList Content;
256-
while (Element_Size - Element_Offset >= 1) {
257-
int64u Current;
258-
if (!Element_Code) {
259-
int8u Start;
260-
Peek_L1(Start);
261-
if (Start != 0xFE) {
262-
Trusted_IsNot("0xFE");
263-
Skip_XX(Element_Size-Element_Offset, "(Unknown)");
264-
break;
265-
}
266-
Skip_L1( "0xFE");
267-
Skip_L1( "Horizontal alignment");
268-
Skip_L1( "(Unknown)");
269-
for (Current = Element_Offset; Current < Element_Size; Current++) {
270-
if (Buffer[Buffer_Offset + Current] == 0xFE) {
265+
auto NextPartIsStart = true;
266+
auto NextPartFormat = Format_8bit;
267+
while (Element_Offset < Element_Size) {
268+
if (NextPartIsStart) {
269+
if (!Element_Code) {
270+
int8u Start;
271+
Peek_L1(Start);
272+
auto Size = Element_Size - Element_Offset;
273+
if (Start != 0xFE || Size <= 2) {
274+
Trusted_IsNot("0xFE");
275+
Skip_XX(Size, "(Unknown)");
271276
break;
272277
}
278+
Skip_L1( "Line start marker");
279+
Skip_L1( "Horizontal alignment");
280+
Skip_L1( "(Unknown)");
273281
}
282+
Content.resize(Content.size() + 1);
283+
NextPartIsStart = false;
274284
}
275-
else {
276-
Current = Element_Size;
277-
}
278-
Content.resize(Content.size() + 1);
279-
auto Size = Current - Element_Offset;
280-
bool IsUtf8 = false;
281-
bool IsW16 = false;
282-
while (Size) {
283-
if (Size > 2) {
284-
size_t Pos = Buffer_Offset + (size_t)Element_Offset;
285-
if (Buffer[Pos] == 0x1F && Buffer[Pos + 1] == 'C' && Buffer[Pos + 2] >= '0' && Buffer[Pos + 2] <= '9') {
286-
Skip_B3( "(Unknown)");
287-
Size -= 3;
288-
}
289-
}
290-
if (Size >= 5) {
291-
int64u Probe;
292-
Peek_B5(Probe);
293-
if (Probe >> 8 == 0x1FEFBBBF) {
294-
Skip_B4( "UTF-8 start");
295-
Size -= 4;
296-
IsUtf8 = true;
297-
}
298-
if (Probe == 0x1F5731362ELL) {
299-
Skip_B5( "W16 start");
300-
Size -= 5;
301-
IsW16 = true;
302-
}
303-
}
304-
if (IsUtf8) {
305-
int64u End = Element_Offset;
306-
for (; End < Element_Size; End++) {
307-
const auto Data = Buffer[Buffer_Offset + End];
308-
if (Data == 0x2E || Data == 0xFF) {
285+
286+
auto Element_Middle = Element_Offset;
287+
auto Format = NextPartFormat;
288+
NextPartFormat = Format_8bit;
289+
for (Element_Middle = Element_Offset; Element_Middle < Element_Size; Element_Middle += Format == Format_16bit ? 2 : 1) {
290+
const auto Value = Buffer[Buffer_Offset + Element_Middle];
291+
if (Value == 0x1F) {
292+
auto Buffer_Current = Buffer + Buffer_Offset + (size_t)Element_Middle + 1;
293+
auto NextPartMaxSize = Element_Size - Element_Middle;
294+
if (NextPartMaxSize >= 3) {
295+
auto Probe = BigEndian2int16u(Buffer_Current);
296+
if (Probe >= 0x4330 && Probe <= 0x4339) { // C0 - C9
297+
NextPartFormat = Format_Skip3;
309298
break;
310299
}
311300
}
312-
Get_UTF8(End - Element_Offset, Content.back(), "Content");
313-
if (Element_Offset < Element_Size) {
314-
if (Buffer[Buffer_Offset + Element_Offset] == 0xFF) {
315-
Skip_L1( "Dot");
316-
Content.back() += __T('.');
301+
if (NextPartMaxSize > 3) {
302+
auto Probe = BigEndian2int24u(Buffer_Current);
303+
if (Probe >= 0x522E30 && Probe <= 0x522E39) { // R.0 - R.9
304+
NextPartFormat = Format_Skip4;
305+
break;
306+
}
307+
if (Probe == 0xEFBBBF) {
308+
NextPartFormat = Format_Utf8;
309+
break;
317310
}
318-
else { // 0x2E
319-
Skip_B1( "UTF-8 end");
320-
IsUtf8 = false;
311+
}
312+
if (NextPartMaxSize >= 5) {
313+
auto Probe = BigEndian2int32u(Buffer_Current);
314+
if (Probe == 0x5731362E) { // W16.
315+
NextPartFormat = Format_16bit;
316+
break;
321317
}
322-
Size -= 1;
323318
}
324-
Count_UTF8++;
325319
}
326-
else if (IsW16) {
327-
Skip_XX(Size, "Content");
328-
Content.back().resize(Size / 2);
329-
Count_2byte++;
320+
if (Value == 0xFE && !Element_Code) {
321+
NextPartIsStart = true;
322+
break;
330323
}
331-
else {
332-
Get_ISO_8859_1(Size, Content.back(), "Content");
333-
bool Is8bit = false;
334-
for (const auto& Character : Content.back()) {
335-
if ((wchar_t)Character >= 0x80) {
336-
Is8bit = true;
337-
}
324+
}
325+
326+
// Current
327+
auto Size = Element_Middle - Element_Offset;
328+
switch (Format) {
329+
case Format_8bit: {
330+
Ztring Value;
331+
Get_ISO_8859_1(Size, Value, "Content");
332+
bool Is8bit = false;
333+
for (const auto& Character : Value) {
334+
if ((unsigned)Character >= 0x80) {
335+
Is8bit = true;
338336
}
339-
if (Is8bit) {
340-
Count_1byte8++;
341-
auto& Line = Content.back();
342-
if (Line.size() < numeric_limits<int>::max()) {
343-
int Max = (int)Line.size();
344-
for (int i = Max; i > 0; i--) {
345-
const auto Character = Line[i];
346-
if ((wchar_t)Character >= 0xE0) {
347-
Line.erase(i, 1); // Not supported but we currently don't need the real text
348-
}
349-
}
337+
}
338+
if (Is8bit) {
339+
Count_1byte8++;
340+
for (auto i = Value.size() - 1; i; --i) {
341+
const auto Character = Value[i];
342+
if ((unsigned)Character >= 0xE0) {
343+
Value.erase(i, 1); // Content reading not supported but no need for counting
350344
}
351345
}
352-
else {
353-
Count_1byte7++;
346+
}
347+
else if (Size) {
348+
Count_1byte7++;
349+
}
350+
Content.back() += Value;
351+
break;
352+
}
353+
case Format_Skip4:
354+
case Format_Skip3: {
355+
break;
356+
}
357+
case Format_Utf8: {
358+
auto Element_UTF8End = Element_Offset;
359+
for (; Element_UTF8End < Element_Middle; ++Element_UTF8End) {
360+
const auto Data = Buffer[Buffer_Offset + Element_UTF8End];
361+
if (Data == 0x2E || Data == 0xFF) {
362+
Ztring Value;
363+
Get_UTF8(Element_UTF8End - Element_Offset, Value, "Content");
364+
Content.back() += Value;
365+
if (Data == 0xFF) {
366+
Skip_L1( "Dot");
367+
Content.back() += __T('.');
368+
continue;
369+
}
370+
Skip_C1( "UTF-8 end");
371+
NextPartFormat = Format_8bit;
354372
}
355373
}
356-
Size = Current - Element_Offset;
374+
Count_UTF8++;
375+
break;
376+
}
377+
case Format_16bit: {
378+
auto Current = (int16u*)(Buffer + Buffer_Offset + (size_t)Element_Offset);
379+
auto End = Current + Size / 2;
380+
for (; Current < End; ++Current) {
381+
Content.back() += (wchar_t)*Current; // Content reading not supported but no need for counting, we need only the values < 0x80
382+
}
383+
Skip_XX(Size, "W16 content");
384+
Count_2byte++;
385+
break;
386+
}
387+
}
388+
389+
// Next
390+
switch (NextPartFormat) {
391+
case Format_8bit: {
392+
break;
393+
}
394+
case Format_Skip3: {
395+
Skip_C3( "(Unknown)");
396+
break;
397+
}
398+
case Format_Skip4: {
399+
Skip_C4( "(Unknown)");
400+
break;
401+
}
402+
case Format_Utf8: {
403+
Skip_C4( "UTF-8 start");
404+
break;
405+
}
406+
case Format_16bit:{
407+
Skip_C5( "W16 start");
408+
break;
409+
}
357410
}
358411
}
359412

@@ -481,16 +534,18 @@ void File_Pac::Data_Parse()
481534
for (const auto& Line : Content) {
482535
auto CountOfCharsPerLine = Line.size();
483536
size_t Pos = 0;
484-
for (;;) {
485-
Pos = Line.find('<', Pos);
486-
if (Pos == string::npos) {
487-
break;
537+
bool ItalicBeginFound = false;
538+
for (const auto& Value : Line) {
539+
if (Value < 0x20) { // Non printable
540+
CountOfCharsPerLine--;
541+
}
542+
if (Value == '<') {
543+
ItalicBeginFound = true;
488544
}
489-
Pos = Line.find('>', Pos + 1);
490-
if (Pos == string::npos) {
491-
break;
545+
if (Value == '<') {
546+
CountOfCharsPerLine -= 2; // < and > are markers of italic
547+
ItalicBeginFound = false;
492548
}
493-
CountOfCharsPerLine += 2; // < and > are marker of italic
494549
}
495550
if (MaxCountOfCharsPerLine < CountOfCharsPerLine) {
496551
MaxCountOfCharsPerLine = CountOfCharsPerLine;

0 commit comments

Comments
 (0)