Skip to content

TextFragmentAbsorber can't match phone by pattern #55

@liurupeng821

Description

@liurupeng821

The text in the PDF uses embedded fonts, so I can't match the phone number by pattern.

https://lagou-zhaopin-fe.lagou.com/activities/20221229/1672295126482.pdf


/**
 * Regular expression used to locate phone numbers in extracted PDF text.
 *
 * <p>The pattern has two alternatives, followed by a negative lookahead that rejects a
 * match immediately followed by another digit:
 * <ol>
 *   <li>An 11-digit number: the digit 1, then one digit in the range 3-9, then nine more
 *       digits. Any run of hyphens or whitespace is allowed between consecutive digits.</li>
 *   <li>A number starting with 0, then one digit 1-9 and one or two further digits,
 *       optional hyphens/whitespace, then seven or eight digits.</li>
 * </ol>
 *
 * <p>NOTE(review): these look like mobile and area-code landline formats respectively;
 * confirm the intended numbering plan with the author.
 */
public static final String PHONE_REG = "(?:(?:1[-\\s]*[3456789][-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1})|(?:0[1-9]\\d{1,2}[-\\s]*\\d{7,8}))(?!\\d)";
/**
 * Demo entry point: searches one page of a local PDF for phone numbers and logs each match.
 *
 * <p>Steps: read {@code /1672295126482.pdf} into memory, check the license via
 * {@code getLicense()}, open the bytes as a {@code Document}, run a
 * {@code TextFragmentAbsorber} built from {@code PHONE_REG} over the page returned by
 * {@code get_Item(1)}, and log the text of every fragment found.
 *
 * <p>Failures raised while opening or searching the document are caught and printed, so
 * they do not propagate to the caller.
 *
 * @param args command-line arguments (unused)
 * @throws Exception if the source file cannot be read
 * @throws IllegalStateException if {@code getLicense()} returns {@code false}
 */
public static void main(String[] args) throws Exception {
    byte[] source = FileUtils.readFileToByteArray(new File("/1672295126482.pdf"));
    if (!getLicense()) {
        // A specific unchecked type instead of a raw Exception; still covered by `throws Exception`.
        throw new IllegalStateException("com.aspose.pdf lic ERROR!");
    }
    // Only the input stream is needed here; nothing is written back out in this method.
    try (ByteArrayInputStream searchInputStream = new ByteArrayInputStream(source)) {
        Document pdfDoc = new Document(searchInputStream);

        // NOTE(review): `true` presumably enables regular-expression matching — confirm
        // against the TextSearchOptions constructor documentation.
        TextSearchOptions textSearchOptions = new TextSearchOptions(true);
        // NOTE(review): constructor arguments kept exactly as in the report; verify that this
        // overload selects the intended language-transformation behaviour.
        TextEditOptions textEditOptions = new TextEditOptions(0, TextEditOptions.LanguageTransformation.class);
        TextFragmentAbsorber phoneTextFragmentAbsorber = new TextFragmentAbsorber(
                PHONE_REG,
                textSearchOptions,
                textEditOptions);

        // Restrict the search to a single page rather than the whole document.
        Page page = pdfDoc.getPages().get_Item(1);
        page.accept(phoneTextFragmentAbsorber);

        for (TextFragment textFragment : phoneTextFragmentAbsorber.getTextFragments()) {
            logger.info("phone: " + textFragment.getText());
        }
    } catch (Exception e) {
        // Best-effort demo: report the failure and let main return normally.
        e.printStackTrace();
    }
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions