scipaper_classifier/reading_pdf.py at main · rizarae-p/scipaper_classifier · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import os
import fitz
from parse import tokenize_and_match

def check_deeplabcut_citation(pdf_path):
    """
    Check whether the method/result sections of a PDF mention "DeepLabCut".

    When the PDF has a usable table of contents, only the pages from each
    heading containing "Methodology", "Materials and Methods", "Results" or
    "Methods" up to the next such heading (or the end of the document for the
    last one) are read. Without a usable table of contents the whole document
    is read.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        bool | tuple: ``False`` (a bare bool) when "Supplementary" occurs in
        ``pdf_path``; the file is not opened in that case. Otherwise a tuple
        ``(cited, text)`` where ``cited`` is True if "DeepLabCut" occurs in
        the extracted text and ``text`` is that extracted text.

        NOTE: a non-empty tuple is always truthy, so callers must inspect the
        first element rather than testing the return value directly.
    """
    if "Supplementary" in pdf_path:
        return False
    print(f"Reading {pdf_path}...")
    sections_for_checking = ["Methodology", "Materials and Methods", "Results", "Methods"]
    page_texts = []

    with fitz.open(pdf_path) as pdf_document:
        page_count = len(pdf_document)
        toc = pdf_document.get_toc()
        if len(toc) == 0 or all(entry[2] == 0 for entry in toc):
            print("No table of contents found, reading full document.")
            # Read every page exactly once (page 0 used to be added twice).
            page_ranges = [(0, page_count)]
        else:
            print("Table of contents found.")
            # Ranges are kept in a list rather than a dict keyed by section
            # name, so a heading that matches the same name twice no longer
            # discards the pages of its earlier occurrence.
            page_ranges = []
            open_start = None
            for toc_entry in toc:
                _level, title, page_num = toc_entry
                page_num -= 1  # TOC pages are 1-based, load_page is 0-based
                if page_num < 0:
                    continue
                if any(section in title for section in sections_for_checking):
                    if open_start is not None:
                        page_ranges.append((open_start, page_num))
                    open_start = page_num
            if open_start is not None:
                # The last matching section runs to the end of the document.
                page_ranges.append((open_start, page_count))

        for start, end in page_ranges:
            for page_num in range(start, end):
                page_texts.append(pdf_document.load_page(page_num).get_text("text"))

    # Each page is preceded by a newline, matching extract_text_from_pdf.
    full_text = "".join("\n" + page_text for page_text in page_texts)
    deeplabcut_cited = "DeepLabCut" in full_text
    return deeplabcut_cited, full_text

def analyze_papers(directory):
    """
    Analyze the PDF files in a directory to count the number of papers that cite DeepLabCut
    and the number of papers that do not cite DeepLabCut.

    Only entries whose name ends with ".pdf" are considered; they are
    processed in sorted filename order so the result dictionaries have a
    deterministic order.

    Args:
        directory (str): The path to the directory containing the PDF files.

    Returns:
        tuple: A tuple containing four elements:
            - int: Number of papers citing DeepLabCut.
            - int: Number of papers not mentioning DeepLabCut.
            - dict: Maps filename -> file path for papers that mention DeepLabCut.
            - dict: Maps filename -> file path for papers that do not.
    """
    papers_with_deeplabcut = 0
    papers_without_deeplabcut = 0

    papers_with_deeplabcut_dict = {}
    papers_without_deeplabcut_dict = {}

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".pdf"):
            continue
        file_path = os.path.join(directory, filename)
        result = check_deeplabcut_citation(file_path)
        # check_deeplabcut_citation returns either a bare bool or a
        # (cited, text) tuple. A non-empty tuple is always truthy, so the
        # flag must be read from its first element instead of testing the
        # tuple itself.
        if isinstance(result, tuple):
            deeplabcut_cited = bool(result[0])
        else:
            deeplabcut_cited = bool(result)

        if deeplabcut_cited:
            papers_with_deeplabcut += 1
            papers_with_deeplabcut_dict[filename] = file_path
        else:
            papers_without_deeplabcut += 1
            papers_without_deeplabcut_dict[filename] = file_path

    return papers_with_deeplabcut, papers_without_deeplabcut, papers_with_deeplabcut_dict, papers_without_deeplabcut_dict

def extract_text_from_pdf(pdf_path):
    """
    Return the plain text of every page of a PDF, concatenated in page order.

    Each page's text is preceded by a newline character.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The text of all pages; an empty string for a document with no pages.
    """
    print(f"Reading {pdf_path}...")

    with fitz.open(pdf_path) as document:
        per_page = [
            document.load_page(index).get_text("text")
            for index in range(len(document))
        ]

    return "".join("\n" + text for text in per_page)

def get_animals_from_papers(paper_paths):
    """
    Run the animal matcher over the full text of each given paper.

    For every entry the PDF is read with extract_text_from_pdf and the
    resulting text is passed to tokenize_and_match.

    Args:
        paper_paths (dict): Maps a paper title to the path of its PDF.

    Returns:
        dict: Maps each paper title to whatever tokenize_and_match returned
        for that paper's full text.
    """
    matches_by_title = {}

    for paper_title, path in paper_paths.items():
        print(f"Processing {path}...")
        document_text = extract_text_from_pdf(path)
        matches_by_title[paper_title] = tokenize_and_match(document_text)

    return matches_by_title


# start = False
    # for page_num in range(1,len(pdf_document)):
    #     page = pdf_document[page_num]
    #     page_text = page.get_text("blocks")
    #     for idx,m in enumerate(page_text):
    #         print(idx,m)
    #     break
        # for col in page_text:
        #     print(col[0])
        #     if not(issubclass(type(col), str)):
        #         continue
        #     else:
        #         print(col)
        #         for section in sections_for_checking:
        #             if f"{section}\n" in col:
        #                 full_text+=col
    #             text_after = page_text.split(f"\n{section}\n")
    #             start = True
    #             full_text += text_after[1]
    #         if start:
    #             full_text += page_text
    #         if "Conclusion" in page_text:
    #             start = False
    # print(full_text)


import os
import fitz
from parse import tokenize_and_match

def extract_section_text(pdf_path, section_name):
    """
    Extract text from a specified section of a PDF document.

    The section is located through the PDF's table of contents: it starts on
    the page of the (last) entry whose title contains ``section_name`` and
    ends at the page of the TOC entry that follows it, or at the end of the
    document if no entry follows. Extraction is page-granular, so text from
    neighbouring sections on the same pages is included.

    If the PDF has no usable table of contents, the whole document is read.

    Args:
        pdf_path (str): The path to the PDF file.
        section_name (str): The name of the section to extract.

    Returns:
        str: The text of the selected pages, each page preceded by a newline;
        an empty string if the table of contents has no matching entry.
    """
    page_texts = []
    with fitz.open(pdf_path) as pdf_document:
        page_count = len(pdf_document)
        toc = pdf_document.get_toc()
        if len(toc) == 0 or all(entry[2] == 0 for entry in toc):
            print("No table of contents found, reading full document.")
            start, end = 0, page_count
        else:
            print("Table of contents found.")
            start = end = None
            section_open = False
            for _level, title, page_num in toc:
                page_num -= 1  # TOC pages are 1-based, load_page is 0-based
                if page_num < 0:
                    continue
                if section_name in title:
                    # A later matching heading replaces an earlier one.
                    start, end = page_num, None
                    section_open = True
                elif section_open:
                    # The end page is exclusive. If the next heading sits on
                    # the same page as the section start, a plain
                    # ``end = page_num`` would give an empty range and lose
                    # the section, so always keep at least the start page.
                    end = min(max(page_num, start + 1), page_count)
                    section_open = False
            if section_open:
                # No entry follows the section: read to the end.
                end = page_count

        if start is not None:
            for page_num in range(start, end):
                page_texts.append(pdf_document.load_page(page_num).get_text("text"))

    return "".join("\n" + page_text for page_text in page_texts)

def get_animals_from_abstract(pdf_path):
    """
    Match animal names in the "Abstract" section of a PDF document.

    The section text is obtained from extract_section_text and handed to
    tokenize_and_match.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        The result of tokenize_and_match for the extracted text
        (presumably a list of animal names; confirm against parse.py).
    """
    return tokenize_and_match(extract_section_text(pdf_path, "Abstract"))

def analyze_papers_from_abstracts(papers_with_deeplabcut_dict):
    """
    Collect the animals matched in the Abstract of each given paper.

    Args:
        papers_with_deeplabcut_dict (dict): Maps a filename to the path of
            the corresponding PDF.

    Returns:
        dict: Maps each filename to the value returned by
        get_animals_from_abstract for that file.
    """
    matches_by_file = {}

    for name, path in papers_with_deeplabcut_dict.items():
        print(f"Processing {path}...")
        matches_by_file[name] = get_animals_from_abstract(path)

    return matches_by_file