// Source: pdf2json/test/_test_type3glyph.cjs at 9cf1b2a2c01cbe9df5f2ff1acf561abc98cd40ae · modesty/pdf2json · GitHub
const fs = require('fs');
const path = require('path');
const PDFParser = require("../dist/pdfparser.cjs");

describe('Type3 Glyph Font Tests', () => {
	// Parser under test; assigned in beforeEach and destroyed in afterEach.
	let pdfParser;
	// Input fixture, resolved relative to this test file's directory.
	const testPdfPath = path.join(__dirname, 'pdf/misc/i389_type3_glyph.pdf');
	// Directory that receives the artifacts written by the output-file test.
	const outputDir = path.join(__dirname, 'target/misc');
	// Pretty-printed JSON dump of the parsed PDF data.
	const jsonOutputPath = path.join(outputDir, 'i389_type3_glyph.json');
	// Plain-text dump: one text run per line plus a page-break marker per page.
	const contentOutputPath = path.join(outputDir, 'i389_type3_glyph.content.txt');

	// Every test starts from a freshly constructed parser instance.
	beforeEach(() => {
		// NOTE(review): the second argument (1) presumably turns on raw-text
		// collection — confirm against the PDFParser constructor.
		pdfParser = new PDFParser(null, 1);
	});

	// Release the parser after each test; nothing to do if none was created.
	afterEach(() => {
		if (!pdfParser) {
			return;
		}
		pdfParser.destroy();
	});

	// Parses the fixture and checks that its single page yields exactly two
	// text items: the Type3 string "CONTENT" and the regular Acrobat text.
	test('should successfully parse Type3 glyph font PDF', async () => {
		// Returns the text item whose first run carries exactly `wanted`.
		const findByFirstRun = (texts, wanted) =>
			texts.find(item => item.R && item.R[0] && item.R[0].T === wanted);

		// All assertions for the parsed result; throws on the first mismatch.
		const verifyParsedData = (pdfData) => {
			// Basic structure assertions
			expect(pdfData).toBeDefined();
			expect(pdfData.Pages).toBeDefined();
			expect(pdfData.Pages.length).toBe(1);
			const firstPage = pdfData.Pages[0];
			expect(firstPage.Texts).toBeDefined();
			expect(firstPage.Texts.length).toBe(2); // Type3 text plus regular text

			// Type3 text "CONTENT"
			const glyphText = findByFirstRun(firstPage.Texts, 'CONTENT');
			expect(glyphText).toBeDefined();
			expect(glyphText.R[0].T).toBe('CONTENT');

			// Regular text "Added Text from Acrobat"
			const plainText = findByFirstRun(firstPage.Texts, 'Added Text from Acrobat');
			expect(plainText).toBeDefined();
			expect(plainText.R[0].T).toBe('Added Text from Acrobat');

			console.log('✓ Type3 glyph font parsing successful');
			console.log(`✓ Found Type3 text: "${glyphText.R[0].T}"`);
			console.log(`✓ Found regular text: "${plainText.R[0].T}"`);
		};

		// Bridge the parser's event API to a promise the test can await.
		await new Promise((resolve, reject) => {
			const onError = (errData) => {
				reject(new Error(`PDF parsing failed: ${errData.parserError}`));
			};

			const onReady = (pdfData) => {
				try {
					verifyParsedData(pdfData);
					resolve();
				} catch (error) {
					// Surface assertion failures through the promise so the
					// test fails immediately instead of timing out.
					reject(error);
				}
			};

			pdfParser.on('pdfParser_dataError', onError);
			pdfParser.on('pdfParser_dataReady', onReady);

			// Kick off loading and parsing of the fixture
			pdfParser.loadPDF(testPdfPath);
		});
	}, 30000); // 30 second timeout

	// Parses the fixture, writes a JSON dump and a plain-text content dump to
	// the output directory, then reads both back and checks that each one
	// contains the Type3 text and the regular text.
	test('should generate correct output files with both texts', async () => {
		// With `recursive: true` mkdirSync does not fail when the directory
		// already exists, so no separate existence check is needed.
		fs.mkdirSync(outputDir, { recursive: true });

		return new Promise((resolve, reject) => {
			pdfParser.on('pdfParser_dataError', (errData) => {
				reject(new Error(`PDF parsing failed: ${errData.parserError}`));
			});

			pdfParser.on('pdfParser_dataReady', (pdfData) => {
				try {
					// Write JSON output
					const jsonOutput = JSON.stringify(pdfData, null, 2);
					fs.writeFileSync(jsonOutputPath, jsonOutput, 'utf8');

					// Collect one line per text run, followed by a break marker
					// for each page; every line is newline-terminated.
					const contentLines = [];
					pdfData.Pages.forEach((page, pageIndex) => {
						for (const text of page.Texts) {
							if (text.R) {
								for (const run of text.R) {
									contentLines.push(run.T);
								}
							}
						}
						contentLines.push(`----------------Page (${pageIndex}) Break----------------`);
					});
					const contentOutput = contentLines.map(line => `${line}\n`).join('');
					fs.writeFileSync(contentOutputPath, contentOutput, 'utf8');

					// Verify JSON file exists and contains both texts
					expect(fs.existsSync(jsonOutputPath)).toBe(true);
					const jsonContent = fs.readFileSync(jsonOutputPath, 'utf8');
					const parsedJson = JSON.parse(jsonContent);

					expect(parsedJson.Pages[0].Texts.length).toBe(2);
					expect(jsonContent).toContain('CONTENT');
					expect(jsonContent).toContain('Added Text from Acrobat');

					// Verify content file exists and contains both texts
					expect(fs.existsSync(contentOutputPath)).toBe(true);
					const contentFileContent = fs.readFileSync(contentOutputPath, 'utf8');
					expect(contentFileContent).toContain('CONTENT');
					expect(contentFileContent).toContain('Added Text from Acrobat');

					console.log('✓ JSON output file created successfully');
					console.log('✓ Content output file created successfully');
					console.log('✓ Both files contain expected Type3 and regular text');

					resolve();
				} catch (error) {
					// Route I/O and assertion failures through the promise so
					// the test fails right away rather than timing out.
					reject(error);
				}
			});

			// Load and parse the PDF
			pdfParser.loadPDF(testPdfPath);
		});
	}, 30000); // 30 second timeout

	// Parses the fixture and checks that the Type3 "CONTENT" text item carries
	// numeric coordinates and a single run with style information.
	test('should handle Type3 font metadata correctly', async () => {
		return new Promise((resolve, reject) => {
			pdfParser.on('pdfParser_dataError', (errData) => {
				reject(new Error(`PDF parsing failed: ${errData.parserError}`));
			});

			pdfParser.on('pdfParser_dataReady', (pdfData) => {
				try {
					const page = pdfData.Pages[0];

					// Find Type3 text
					const type3Text = page.Texts.find(text =>
						text.R && text.R[0] && text.R[0].T === 'CONTENT'
					);

					// Fail with a clear assertion if the text is missing, rather
					// than a TypeError from dereferencing undefined below.
					expect(type3Text).toBeDefined();

					// Verify Type3 text has proper positioning
					expect(type3Text.x).toBeDefined();
					expect(type3Text.y).toBeDefined();
					expect(typeof type3Text.x).toBe('number');
					expect(typeof type3Text.y).toBe('number');

					// Verify text run structure
					expect(type3Text.R).toBeDefined();
					expect(type3Text.R.length).toBe(1);
					expect(type3Text.R[0].T).toBe('CONTENT');
					expect(type3Text.R[0].S).toBeDefined(); // Style index
					expect(type3Text.R[0].TS).toBeDefined(); // Text style array
					// The log line below joins TS, so require an actual array.
					expect(Array.isArray(type3Text.R[0].TS)).toBe(true);

					console.log('✓ Type3 font metadata validation successful');
					console.log(`✓ Type3 text position: (${type3Text.x}, ${type3Text.y})`);
					console.log(`✓ Type3 text style: S=${type3Text.R[0].S}, TS=[${type3Text.R[0].TS.join(',')}]`);

					resolve();
				} catch (error) {
					// Route assertion failures through the promise so the test
					// fails right away rather than timing out.
					reject(error);
				}
			});

			// Load and parse the PDF
			pdfParser.loadPDF(testPdfPath);
		});
	}, 30000); // 30 second timeout
});