Detect entities in a set of documents

# Guard statement: stop execution immediately with a clear message.
# NOTE(review): presumably here so the AWS calls further down are not run
# unintentionally when the whole notebook is executed - confirm.
raise SystemExit("Stop right there!")
An exception has occurred, use %tb to see the full traceback.

SystemExit: Stop right there!
# import glob to find text files with .txt ending -
# https://docs.python.org/3.6/library/glob.html
import glob

# import json module to serialize JSON -
# https://docs.python.org/3.6/library/json.html
import json

# import os to extract file names from paths portably -
# https://docs.python.org/3.6/library/os.path.html
import os

# import the AWS SDK for python (boto3) -
# http://boto3.readthedocs.io/en/latest/
import boto3
 1# Instantiating a new comprehend client
 2comprehend = boto3.client(service_name="comprehend")
 3
 4# Works for Linux, OSX. Change to \\ for windows.
 5data_dir = "objects/*.txt"
 6files = glob.glob(data_dir)
 7
 8for file in files:
 9    with open(file, "r", encoding="utf-8") as f:
10        file_as_str = f.read()
11        # python string formatting to print the text file name
12        print(
13            "Calling detect_entities_from_documents.py on file: %s"
14            % file[-15:]
15        )
16        # json.dumps() writes JSON data to a Python string
17        print(
18            json.dumps(
19                comprehend.detect_entities(
20                    Text=file_as_str, LanguageCode="en"
21                ),
22                sort_keys=True,
23                indent=4,
24            )
25        )
26        print("End of detect_entities\n")
Calling detect_entities_from_documents.py on file: pratchett_2.txt
{
    "Entities": [
        {
            "BeginOffset": 14,
            "EndOffset": 24,
            "Score": 0.6801316738128662,
            "Text": "Stonehenge",
            "Type": "TITLE"
        },
        {
            "BeginOffset": 69,
            "EndOffset": 78,
            "Score": 0.9977608919143677,
            "Text": "Microsoft",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 88,
            "EndOffset": 95,
            "Score": 0.9023981094360352,
            "Text": "Avebury",
            "Type": "TITLE"
        },
        {
            "BeginOffset": 114,
            "EndOffset": 119,
            "Score": 0.9812510013580322,
            "Text": "Apple",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 206,
            "EndOffset": 233,
            "Score": 0.8046249747276306,
            "Text": "about once every six months",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 275,
            "EndOffset": 290,
            "Score": 0.9184390902519226,
            "Text": "WordPerfect 4.2",
            "Type": "TITLE"
        },
        {
            "BeginOffset": 320,
            "EndOffset": 340,
            "Score": 0.6904553174972534,
            "Text": "about once every two",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 344,
            "EndOffset": 355,
            "Score": 0.9218593835830688,
            "Text": "three years",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 484,
            "EndOffset": 488,
            "Score": 0.6738322973251343,
            "Text": "Mort",
            "Type": "TITLE"
        },
        {
            "BeginOffset": 507,
            "EndOffset": 509,
            "Score": 0.9828040599822998,
            "Text": "UK",
            "Type": "LOCATION"
        },
        {
            "BeginOffset": 555,
            "EndOffset": 559,
            "Score": 0.9985629916191101,
            "Text": "Hugh",
            "Type": "PERSON"
        },
        {
            "BeginOffset": 563,
            "EndOffset": 567,
            "Score": 0.9947370886802673,
            "Text": "Emma",
            "Type": "PERSON"
        },
        {
            "BeginOffset": 585,
            "EndOffset": 594,
            "Score": 0.9137960076332092,
            "Text": "Sheffield",
            "Type": "PERSON"
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "1308",
            "content-type": "application/x-amz-json-1.1",
            "date": "Mon, 28 Mar 2022 00:31:08 GMT",
            "x-amzn-requestid": "61f26645-ae3d-4f6e-8ec2-5287709eca9d"
        },
        "HTTPStatusCode": 200,
        "RequestId": "61f26645-ae3d-4f6e-8ec2-5287709eca9d",
        "RetryAttempts": 0
    }
}
End of detect_entities

Calling detect_entities_from_documents.py on file: pratchett_1.txt
{
    "Entities": [
        {
            "BeginOffset": 78,
            "EndOffset": 101,
            "Score": 0.9171451330184937,
            "Text": "more than three letters",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 189,
            "EndOffset": 201,
            "Score": 0.8697581887245178,
            "Text": "last Tuesday",
            "Type": "DATE"
        },
        {
            "BeginOffset": 266,
            "EndOffset": 271,
            "Score": 0.3802641034126282,
            "Text": "Three",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 272,
            "EndOffset": 283,
            "Score": 0.8991808295249939,
            "Text": "Mile Island",
            "Type": "LOCATION"
        },
        {
            "BeginOffset": 356,
            "EndOffset": 364,
            "Score": 0.7091284990310669,
            "Text": "Thousand",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 365,
            "EndOffset": 371,
            "Score": 0.42392757534980774,
            "Text": "Island",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 411,
            "EndOffset": 417,
            "Score": 0.5102078318595886,
            "Text": "Bognor",
            "Type": "PERSON"
        },
        {
            "BeginOffset": 460,
            "EndOffset": 467,
            "Score": 0.9445776343345642,
            "Text": "English",
            "Type": "OTHER"
        },
        {
            "BeginOffset": 753,
            "EndOffset": 774,
            "Score": 0.9797861576080322,
            "Text": "two hundred years ago",
            "Type": "DATE"
        },
        {
            "BeginOffset": 964,
            "EndOffset": 973,
            "Score": 0.9959953427314758,
            "Text": "100 miles",
            "Type": "QUANTITY"
        },
        {
            "BeginOffset": 1170,
            "EndOffset": 1183,
            "Score": 0.9878700971603394,
            "Text": "New Jerusalem",
            "Type": "LOCATION"
        },
        {
            "BeginOffset": 1236,
            "EndOffset": 1238,
            "Score": 0.9451072812080383,
            "Text": "UK",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 1259,
            "EndOffset": 1279,
            "Score": 0.7746501564979553,
            "Text": "past couple of years",
            "Type": "DATE"
        },
        {
            "BeginOffset": 1337,
            "EndOffset": 1350,
            "Score": 0.8914646506309509,
            "Text": "Lord Vetinari",
            "Type": "PERSON"
        },
        {
            "BeginOffset": 1419,
            "EndOffset": 1431,
            "Score": 0.9574230909347534,
            "Text": "Elizabethans",
            "Type": "PERSON"
        },
        {
            "BeginOffset": 1526,
            "EndOffset": 1533,
            "Score": 0.9814733862876892,
            "Text": "English",
            "Type": "OTHER"
        },
        {
            "BeginOffset": 1567,
            "EndOffset": 1581,
            "Score": 0.5789746046066284,
            "Text": "at least three",
            "Type": "QUANTITY"
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "content-length": "1748",
            "content-type": "application/x-amz-json-1.1",
            "date": "Mon, 28 Mar 2022 00:31:08 GMT",
            "x-amzn-requestid": "79874e3d-324e-4322-a4b5-9e38d722b60a"
        },
        "HTTPStatusCode": 200,
        "RequestId": "79874e3d-324e-4322-a4b5-9e38d722b60a",
        "RetryAttempts": 0
    }
}
End of detect_entities