ElasticSearch DSL操作

[toc]

# 创建索引

curl -XPUT http://IP:9200/ads_user_profile -H 'Content-Type: application/json' -H 'Authorization: Basic <base64(用户名:密码)>' -d'
{
  "settings":{
    "number_of_shards":6,
    "number_of_replicas":0
  },
  "mappings":{
      "properties":{
        "one_id":{
          "type":"keyword"
        },
        "user_groups":{
          "type": "nested",
          "properties":{
            "code":{"type":"keyword"},
            "name":{"type":"keyword"}
          }
        }
      }
  }
}
'

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

# 删除某个索引下全部数据

POST operator_other_index/_delete_by_query?wait_for_completion=false
  {
    "query": {
      "match_all": {}
    }
  }

1
2
3
4
5
6

# Ip查询


GET my-index/_search
{
  "query": {
    "term": {
      "ip_addr": "192.168.0.0/16"
    }
  }
}

1
2
3
4
5
6
7
8
9

针对上面的搜索，稍微做一下解释：IPv4 地址由 4 个字节（byte）组成，每个字节包含 8 个二进制位（bit）。上面的 /16 表示取前 16 位作为网络前缀，也即 192.168。也就是说，任何位于 192.168.0.0 至 192.168.255.255 之间的 IP 地址都在这个范围内。

# 根据ip范围查询

GET operator_other_index/_search
{
  "query": {
    "range": {
      "start_ip": {
        "gte": "192.168.2.100",
        "lte": "192.168.2.102"
      }
    }
  } ,"_source": [
    "start_ip",
    "end_ip"
  ]
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14

# 获取重复数据

GET test.project/_search
{
    "size":0,
    "aggs":{
        "field":{
            "terms":{
                "field":"id.keyword",
                "size":3000,
                "min_doc_count":2
            }
        }
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14

# 获取去重后数量

GET test.project/_search
{
  "size": 0, 
  "aggs": {
    "count": {
      "cardinality": {
        "field": "id.keyword"
      }
    }
  }
}

1
2
3
4
5
6
7
8
9
10
11

# 模糊(Like)匹配单个字段

GET operator_other_index/_search
{
    "query":{
        "wildcard":{
            "certificate_code":"*824607*"
        }
    }
}

1
2
3
4
5
6
7
8

# 模糊(Like)匹配多个字段

GET operator_other_index/_search
{
    "query":{
        "bool":{
            "should":[
                {
                    "wildcard":{
                        "name":"*张*"
                    }
                },
                {
                    "wildcard":{
                        "emergency_contact_name":"*张*"
                    }
                },
                {
                    "wildcard":{
                        "certificate_type":"*张*"
                    }
                }
            ]
        }
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24

# 查询只返回某些指定字段

返回start_ip与end_ip字段

GET operator_other_index/_search
{
  "_source": [
    "start_ip",
    "end_ip"
  ]
}

1
2
3
4
5
6
7

# 多字段检索 multi_match

multi_match 说明：https://www.elastic.co/guide/cn/elasticsearch/guide/current/multi-match-query.html

GET operator_other_index/_search
{
  "query": {
    "multi_match": {
      "query": "互联网数据中心编码",
      "fields": ["data_center_service_code","computer_room_address","name","user_type","credit_code","address","emergency_contact_name","certificate_code","certificate_type","mobile_phone","phone"]
    }
  }
}

1
2
3
4
5
6
7
8
9

# 全字段检索

GET operator_other_index/_search
{
  "query": {
    "multi_match": {
      "query": "互联网数据中心编码"
    }
  }
}

1
2
3
4
5
6
7
8

# 全字段检索 - 设置完全匹配 minimum_should_match

参考：https://blog.csdn.net/qq_22985751/article/details/90704189

这里写 100%，表示搜索词分词后的每个词项都必须命中

GET operator_other_index/_search
{
  "query": {
    "multi_match": {
      "query": "申伟",
      "fields": ["data_center_service_code","computer_room_address","name","user_type","credit_code","address","emergency_contact_name","certificate_code","certificate_type","mobile_phone","phone"],
    "minimum_should_match":"100%"
    }
  }
}

1
2
3
4
5
6
7
8
9
10

# 根据keyword字段进行group by

java代码：https://www.cnblogs.com/xionggeclub/p/7975982.html

GET log_lnk_data_flow_index/_search
{
  "size":0,
  "aggs": {     
    "group_by_keyword": {    
      "terms": { 
        "field": "task_keyword" 
        ,"size": 40000
        ,"order": {
          "_count": "asc"
        }
      }  
    }
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15

# 根据ID更新数据

数据必须存在，如果之前不存在则会报错，报错内容如下

{
  "error" : {
    "root_cause" : [
      {
        "type" : "document_missing_exception",
        "reason" : "[_doc][2]: document missing",
        "index_uuid" : "KhAqJx5SR7uJIVZkdO0LIw",
        "shard" : "0",
        "index" : "index1"
      }
    ],
    "type" : "document_missing_exception",
    "reason" : "[_doc][2]: document missing",
    "index_uuid" : "KhAqJx5SR7uJIVZkdO0LIw",
    "shard" : "0",
    "index" : "index1"
  },
  "status" : 404
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

POST /customer/_update/1?pretty
{
  "doc": { "name": "Jane Doe", "age": 20 }
}

1
2
3
4

# Upsert操作

upsert 操作用于如果指定的 document 不存在，就执行 upsert 中的初始化操作；如果指定的 document 存在，就执行 doc 或者 script 指定的 partial update 操作

往 index1 索引中更新 id 为 3 的数据：如果 id 为 3 的数据不存在，则将 upsert 下的内容作为新文档插入（即 counter 为 1）；如果存在，则使用 doc 下的内容做部分更新（修改或新增字段 name 为 new_name）

POST index1/_update/3
{
    "doc" : {
        "name" : "new_name"
    },
    "upsert" : {
        "counter" : 1
    }
}

1
2
3
4
5
6
7
8
9

script demo

数据存在则执行 script，将 num 字段值加 1；数据不存在则以 upsert 下的字段创建新文档

POST indexname/_update/id
{
   "script" : "ctx._source.num+=1",
   "upsert": {
		 "field1":"value1",
      	 "field2":"value2"
	}
}

1
2
3
4
5
6
7
8

# 查询nested类型字段满足A=1和A=2的数据

GET voc_data/_search
{
  "query": {
    "bool": {
      "must": [
        {
          "nested": {
            "path": "sentiment_label",
            "query": {
              "bool": {
                "must": [
                  { "match": { "sentiment_label.attribute_code": "32033" } }
                ]
              }
            }
          }
        },
        {
          "nested": {
            "path": "sentiment_label",
            "query": {
              "bool": {
                "must": [
                  { "match": { "sentiment_label.attribute_code": "31141" } }
                ]
              }
            }
          }
        }
      ]
    }
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

# 根据nested类型字段进行集合统计数据

参考：https://www.cnblogs.com/niulang/p/16455158.html

user_groups 是嵌套字段（type=nested），user_groups.name 是它的一个子属性，即数据结构为: [{user_groups.name}, {user_groups.name}, {user_groups.name}]

POST /ads_user_profile/_search?scroll=2m
{
  "timeout": "6000s", 
  "aggregations": {
      "test": {
          "nested": {
              "path": "user_groups"
          },
          "aggregations": {
              "tag_bucket": {
                  "terms": {
                      "field": "user_groups.name"
                  }
              }
          }
      }
  },
  "query": { //查询条件可有可无
    "term": {
      "one_id": {
        "value": "153180716"
      }
    }
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

返回结果

 {
    "test" : {
      "doc_count" : 98840193,
      "tag_bucket" : {
        "doc_count_error_upper_bound" : 0,
        "sum_other_doc_count" : 12336025,
        "buckets" : [
          {
            "key" : "测试新增",
            "doc_count" : 12764541
          },
          {
            "key" : "标签年龄段不等于0-18",
            "doc_count" : 12555174
          }
        ]
      }
    }}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

# 根据指定条件，查询nested字段中b属性等于x，对a属性进行聚合统计，其他不符合条件的子属性不统计

参考：https://blog.csdn.net/qq_23030337/article/details/123005664

已有数据情况如下：

有500多万数据如下
[
  {
    "first_login_time" : "1652407073",
    "user_groups" : [
            {
              "code" : "A001",
              "name" : "测试统计字段"
            },
            {
              "code" : "B001",
              "name" : "测试统计字段2"
            }
    ]
  },......
]

有1条数据如下
[
  {
    "first_login_time" : "1652407073",
    "user_groups" : [
            {
              "code" : "A002",
              "name" : "测试统计字段"
            },
            {
              "code" : "B001",
              "name" : "测试统计字段2"
            }
    ]
  },......
]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

实现统计效果：

只统计 name=测试统计字段 的子文档，并按这些子文档的 code 值进行聚合计数；name 等于其他值的子文档不参与统计

查询语句：

POST index_name/_search?scroll=2m
{
    "timeout":"6000s",
    "aggregations":{
        "test":{
            "nested":{
                "path":"user_groups"
            },
            "aggregations":{
                "tag_bucket":{
                    "filter":{
                        "term":{  // 仅统计 user_groups.name = 测试统计字段 的数据，其他的不统计
                            "user_groups.name":"测试统计字段"
                        }
                    },
                    "aggregations":{
                        "group_count":{ //自定义的统计名称
                            "terms":{  // 根据 user_groups.code 进行聚合数量统计
                                "field":"user_groups.code"
                            }
                        }
                    }
                }
            }
        }
    },
    "query":{
        "nested":{
            "query":{
                "term":{
                    "user_groups.name":{ //最外层查询条件，查询 user_groups.name = 测试统计字段 的数据
                        "value":"测试统计字段",
                        "boost":1
                    }
                }
            },
            "path":"user_groups",
            "ignore_unmapped":false,
            "score_mode":"none",
            "boost":1
        }
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43

查询结果

{
  "_scroll_id" : "FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFjJ6X2lpRXVWUy1DWlVubDJVUGYxVmcAAAAAAxL1ABYxeWhadXI5dVJNcUVsX290ZGRrUEtn",
  "took" : 3205,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "aggregations" : {
    "test" : {
      "doc_count" : 60977543,
      "tag_bucket" : {
        "doc_count" : 5995453,
        "group_count" : {
          "doc_count_error_upper_bound" : 0,
          "sum_other_doc_count" : 0,
          "buckets" : [
            {
              "key" : "A001",
              "doc_count" : 5995452
            },
            {
              "key" : "A002",
              "doc_count" : 1
            }
          ]
        }
      }
    }
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

参考：https://www.xjx100.cn/news/132050.html?action=onClick

# 时间查询语法

这里做下简单介绍

相对时间查询：

now：表示当前时间点。 now-1h：表示从当前时间向前推算的1小时。 now-2d：表示从当前时间向前推算的2天。 now-1w：表示从当前时间向前推算的1周。 now-1M：表示从当前时间向前推算的1个月。 now-1y：表示从当前时间向前推算的1年。

绝对时间查询：

2021-09-01：表示指定的日期，不包括具体的时间。 2021-09-01T10:00:00：表示指定的日期和时间。

查询方法

{"range": {"timestamp": {"gte": "now-1d", "lt": "now"}}}：表示查询过去一天内的数据，即从 24 小时前（含）到当前时间（不含）之间的数据。 {"range": {"timestamp": {"gte": "2021-09-01", "lt": "2021-09-02"}}}：表示查询指定日期范围内的数据，不包括结束日期的数据。

{"range": {"timestamp": {"time_zone": "+08:00", "gte": "now-1h", "lte": "now"}}}：表示在指定的时间区间内进行查询，并指定时区。

{"bool": {"filter": {"range": {"timestamp": {"gte": "now-1h/h", "lte": "now/h"}}}}}：表示按整点取整后查询。/h 表示按小时取整：gte 向下取整到上一个小时的整点，lte 向上取整到当前小时的末尾，因此查询范围是从上一个小时的整点开始，到当前小时结束为止。

# 删除嵌套子文档

示例数据

假设我们有以下示例数据存储在 Elasticsearch 的 voc_data 索引中：

{
  "_index": "voc_data",
  "_type": "_doc",
  "_id": "4438234404442089",
  "_version": 1,
  "_seq_no": 3606641,
  "_primary_term": 1,
  "_ignored": ["content.keyword"],
  "found": true,
  "_source": {
    "comment_num": 0,
    "sentiment": -1,
    "is_internal": 0,
    "forward_num": 0,
    "poc": 1,
    "vehicle_model": "320",
    "insert_time": "2024-03-27 12:33:27",
    "mid": "4438234404442089",
    "collect_num": 0,
    "videos": [],
    "title": "#车质网观点# 【大众捷达和日产轩逸哪款质量更好？】问题：从论坛上看到13代轩逸的变速箱问题比较多，",
    "type": "1",
    "content": "#车质网观点# 【大众捷达和日产轩逸哪款质量更好？】问题：从论坛上看到13代轩逸的变速箱问题比较多，但是挺喜欢它的空间和油耗，捷达挺实用，空间和舒适和捷达差一点，不知道怎么选择，或者有没有其它的可以推荐的，10万元左右的，开着省心，问题少的。答复：您好！#日产轩逸#刹车问题主要存在于16款车型上，另外发动机烧机油问题主要存在于其1.6L车型上，且该问题在这款车中较为普遍，1.8L车型很少收到此类投诉。如果预算充足，建议考虑日产轩逸1.8L车型。另外日产轩逸所使用的CVT变速箱质量可靠性一般，老款车型有不少投诉变速箱阀体故障的。建议观察一段时间。#捷达#这款车质量相对稳定，动力油耗表现不错，作为代步来讲不错。从车质网的投诉来看之前车型有客户反应发动机异响和车身漏水的问题。不过当前已经很少收到此类投诉，更多车型信息见网页链接。建议您亲自试驾后，进行选择。（如有更多咨询，请点击网页链接，选择“提问”，会有专家来答复。）",
    "read_num": 0,
    "update_time": "2024-03-27 12:33:27",
    "sphere": 0,
    "release_day": 20191113,
    "sentiment_label": [
      {
        "sentiment": 0,
        "viewpoint": "舒适",
        "voice_type": 1,
        "first_classification": "10003",
        "attribute": "空间",
        "secondary_classification": "20048",
        "attribute_code": "34110"
      },
      {
        "sentiment": 0,
        "viewpoint": "好",
        "voice_type": 0,
        "attribute": "质量"
      },
      {
        "sentiment": 1,
        "viewpoint": "一般",
        "voice_type": 0,
        "attribute": "可靠性"
      }
    ],
    "brand": 14,
    "pics": ["https://wx1.sinaimg.cn/orj360/5f21faccly1g8p81vmi30j205k05kdg2.jpg"],
    "release_time": "2019-11-13 18:30:03",
    "like_num": 0,
    "author": "车质网",
    "data_source_platform": 2,
    "source_type": "9",
    "click_num": 0,
    "url": "https://m.weibo.cn/status/Ig2EoiDAB?mblogid=Ig2EoiDAB",
    "is_complaint": false,
    "complaint_status": false,
    "user_id": "1596062412",
    "series": 101,
    "article_summary": "捷达",
    "region": "",
    "private_source_type": "903"
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68

查询操作

我们首先需要查询包含特定 attribute_code 的文档。如下所示，我们使用 nested 查询来查找包含 attribute_code = 34110 的文档：

GET /voc_data/_search
{
  "query": {
    "nested": {
      "path": "sentiment_label",
      "query": {
        "term": {
          "sentiment_label.attribute_code": "34110"
        }
      }
    }
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13

上面的查询将返回所有包含 sentiment_label.attribute_code 等于 34110 的文档。

删除子文档操作

接下来，我们将使用 update_by_query API 和 painless 脚本从匹配的文档中删除 attribute_code = 34110 的子文档：

POST /voc_data/_update_by_query
{
  "query": {
    "nested": {
      "path": "sentiment_label",
      "query": {
        "term": {
          "sentiment_label.attribute_code": "34110"
        }
      }
    }
  },
  "script": {
    "source": """
      ctx._source.sentiment_label.removeIf(label -> label.attribute_code == '34110');
    """,
    "lang": "painless"
  }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

解释

查询部分：与之前的查询相同，用于查找包含目标 attribute_code 的文档。
脚本部分：使用 painless 脚本从 sentiment_label 数组中移除 attribute_code 为 34110 的子文档。

上次更新: 2025/01/03, 22:36:28

← ElasticSearch API查看集群状态 ElasticSearch Script操作数据→