Professional Documents
Culture Documents
Build
Build
config
from elasticsearch import Elasticsearch
from neo4j import GraphDatabase
import re
import json
from uuid import uuid4
from elasticsearch import helpers
def get_es_client(hosts="10.1.129.78", port=9200, user="yury",
                  password="bnfjby_130688", http_scheme="https"):
    """Create an Elasticsearch client using HTTP basic authentication.

    Called with no arguments, this connects with the same settings the
    script has always used; each setting can be overridden per call.

    Args:
        hosts: Host specification passed as the first argument to
            ``Elasticsearch``.
        port: Port of the Elasticsearch HTTP(S) endpoint.
        user: Basic-auth user name.
        password: Basic-auth password.
        http_scheme: URL scheme. When it equals ``"https"``
            (case-insensitively) the client is created with SSL enabled and
            certificate verification turned off.

    Returns:
        The constructed ``Elasticsearch`` client.
    """
    # NOTE(review): the default credentials live in source control; consider
    # loading them from the environment or a secrets store instead.
    options = {
        "http_auth": (user, password),
        "scheme": http_scheme,
        "port": port,
    }
    if http_scheme.lower() == 'https':
        # Same TLS settings as before: SSL on, no CA bundle, and no
        # certificate verification.
        options.update(ca_certs=False, verify_certs=False, use_ssl=True)
    return Elasticsearch(hosts, **options)
def get_neo4j_client(host="bolt://10.1.129.78:7687", user="neo4j",
                     password="neo"):
    """Create a Neo4j driver for the given Bolt endpoint.

    Called with no arguments, this connects with the same settings the
    script has always used; each setting can be overridden per call.

    Args:
        host: Bolt URI of the Neo4j server.
        user: User name for authentication.
        password: Password for authentication.

    Returns:
        The driver object returned by ``GraphDatabase.driver``.
    """
    # NOTE(review): the default credentials live in source control; consider
    # loading them from the environment or a secrets store instead.
    return GraphDatabase.driver(host, auth=(user, password))
# Elasticsearch index and document type that the export reads from.
es_index = "email-dnc-entities"
es_doc_type = "DNC"

# Value passed as the ``scroll`` argument of the search request.
scroll_timeout = '10m'

# Field paths inside each indexed document.
text_field = 'email.body'
emails_field = 'entities.identifier:email'
people_field = 'entities.person'
organizations_field = 'entities.organization'

# Select only documents in which the text field exists.
query_body = {
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": text_field}},
            ],
        },
    },
}
# One client per backing store: Elasticsearch is read, Neo4j is handed to
# the per-document entity writer.
es = get_es_client()
neo = get_neo4j_client()

# Initial search request; it asks for up to 1000 hits and passes the scroll
# timeout so the response carries a scroll id.
page = es.search(
    index=es_index,
    doc_type=es_doc_type,
    body=query_body,
    scroll=scroll_timeout,
    size=1000,
)

# Scroll id from the response, and the number of hits in this first page.
sid = page['_scroll_id']
scroll_size = len(page['hits']['hits'])
# Start scrolling
total_docs = 0
while scroll_size > 0:
total_docs += scroll_size
for i in range(0, scroll_size):
doc = page['hits']['hits'][i]['_source']
create_doc_entities_from_to(neo, doc, doc["email"]["body"][0])
# if ('processed' in doc) and ('faces_search' in doc['processed']) \
# and ('embedding_id' in doc['processed']['faces_search'][0]):
# face_id_list = doc['processed']['faces_search'][0][vector_key]
# if len(face_id_list) > 0:
# for face_id in face_id_list:
# res = self.tree_service.get_by_id(clustering_container_name, face_id)
# if res is not None:
# face_ids.append(res[0])
# ids.append(page['hits']['hits'][i]['_id'])
print(doc)