sqlglot.dialects.bigquery
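The listing below is the full source of the BigQuery dialect. As a quick orientation, here is a minimal usage sketch (illustrative, not part of the module; it assumes a recent sqlglot install, and the exact rendered SQL can vary between versions):

    import sqlglot

    # Read BigQuery SQL with this dialect and re-render it for another engine.
    # TO_HEX(MD5(...)) is parsed into a single MD5 node (see _build_to_hex below),
    # which simplifies transpilation to dialects whose MD5 returns a hex digest.
    sql = "SELECT TO_HEX(MD5(col)) FROM `my-project.dataset.tbl`"
    print(sqlglot.transpile(sql, read="bigquery", write="duckdb")[0])
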
from __future__ import annotations

import logging
import re
import typing as t

from sqlglot import exp, generator, parser, tokens, transforms
from sqlglot._typing import E
from sqlglot.dialects.dialect import (
    Dialect,
    NormalizationStrategy,
    annotate_with_type_lambda,
    arg_max_or_min_no_count,
    binary_from_function,
    date_add_interval_sql,
    datestrtodate_sql,
    build_formatted_time,
    filter_array_using_unnest,
    if_sql,
    inline_array_unless_query,
    max_or_greatest,
    min_or_least,
    no_ilike_sql,
    build_date_delta_with_interval,
    regexp_replace_sql,
    rename_func,
    sha256_sql,
    timestrtotime_sql,
    ts_or_ds_add_cast,
    unit_to_var,
    strposition_sql,
    groupconcat_sql,
)
from sqlglot.helper import seq_get, split_num_words
from sqlglot.tokens import TokenType
from sqlglot.generator import unsupported_args

if t.TYPE_CHECKING:
    from sqlglot._typing import Lit

    from sqlglot.optimizer.annotate_types import TypeAnnotator

logger = logging.getLogger("sqlglot")


JSON_EXTRACT_TYPE = t.Union[exp.JSONExtract, exp.JSONExtractScalar, exp.JSONExtractArray]

DQUOTES_ESCAPING_JSON_FUNCTIONS = ("JSON_QUERY", "JSON_VALUE", "JSON_QUERY_ARRAY")


def _derived_table_values_to_unnest(self: BigQuery.Generator, expression: exp.Values) -> str:
    if not expression.find_ancestor(exp.From, exp.Join):
        return self.values_sql(expression)

    structs = []
    alias = expression.args.get("alias")
    for tup in expression.find_all(exp.Tuple):
        field_aliases = (
            alias.columns
            if alias and alias.columns
            else (f"_c{i}" for i in range(len(tup.expressions)))
        )
        expressions = [
            exp.PropertyEQ(this=exp.to_identifier(name), expression=fld)
            for name, fld in zip(field_aliases, tup.expressions)
        ]
        structs.append(exp.Struct(expressions=expressions))

    # Due to `UNNEST_COLUMN_ONLY`, it is expected that the table alias be contained in the columns expression
    alias_name_only = exp.TableAlias(columns=[alias.this]) if alias else None
    return self.unnest_sql(
        exp.Unnest(expressions=[exp.array(*structs, copy=False)], alias=alias_name_only)
    )


def _returnsproperty_sql(self: BigQuery.Generator, expression: exp.ReturnsProperty) -> str:
    this = expression.this
    if isinstance(this, exp.Schema):
        this = f"{self.sql(this, 'this')} <{self.expressions(this)}>"
    else:
        this = self.sql(this)
    return f"RETURNS {this}"


def _create_sql(self: BigQuery.Generator, expression: exp.Create) -> str:
    returns = expression.find(exp.ReturnsProperty)
    if expression.kind == "FUNCTION" and returns and returns.args.get("is_table"):
        expression.set("kind", "TABLE FUNCTION")

    if isinstance(expression.expression, (exp.Subquery, exp.Literal)):
        expression.set("expression", expression.expression.this)

    return self.create_sql(expression)


# https://issuetracker.google.com/issues/162294746
# workaround for bigquery bug when grouping by an expression and then ordering
# WITH x AS (SELECT 1 y)
# SELECT y + 1 z
# FROM x
# GROUP BY x + 1
# ORDER by z
def _alias_ordered_group(expression: exp.Expression) -> exp.Expression:
    if isinstance(expression, exp.Select):
        group = expression.args.get("group")
        order = expression.args.get("order")

        if group and order:
            aliases = {
                select.this: select.args["alias"]
                for select in expression.selects
                if isinstance(select, exp.Alias)
            }

            for grouped in group.expressions:
                if grouped.is_int:
                    continue
                alias = aliases.get(grouped)
                if alias:
                    grouped.replace(exp.column(alias))

    return expression


def _pushdown_cte_column_names(expression: exp.Expression) -> exp.Expression:
    """BigQuery doesn't allow column names when defining a CTE, so we try to push them down."""
    if isinstance(expression, exp.CTE) and expression.alias_column_names:
        cte_query = expression.this

        if cte_query.is_star:
            logger.warning(
                "Can't push down CTE column names for star queries. Run the query through"
                " the optimizer or use 'qualify' to expand the star projections first."
            )
            return expression

        column_names = expression.alias_column_names
        expression.args["alias"].set("columns", None)

        for name, select in zip(column_names, cte_query.selects):
            to_replace = select

            if isinstance(select, exp.Alias):
                select = select.this

            # Inner aliases are shadowed by the CTE column names
            to_replace.replace(exp.alias_(select, name))

    return expression


def _build_parse_timestamp(args: t.List) -> exp.StrToTime:
    this = build_formatted_time(exp.StrToTime, "bigquery")([seq_get(args, 1), seq_get(args, 0)])
    this.set("zone", seq_get(args, 2))
    return this


def _build_timestamp(args: t.List) -> exp.Timestamp:
    timestamp = exp.Timestamp.from_arg_list(args)
    timestamp.set("with_tz", True)
    return timestamp


def _build_date(args: t.List) -> exp.Date | exp.DateFromParts:
    expr_type = exp.DateFromParts if len(args) == 3 else exp.Date
    return expr_type.from_arg_list(args)


def _build_to_hex(args: t.List) -> exp.Hex | exp.MD5:
    # TO_HEX(MD5(..)) is common in BigQuery, so it's parsed into MD5 to simplify its transpilation
    arg = seq_get(args, 0)
    return exp.MD5(this=arg.this) if isinstance(arg, exp.MD5Digest) else exp.LowerHex(this=arg)


def _array_contains_sql(self: BigQuery.Generator, expression: exp.ArrayContains) -> str:
    return self.sql(
        exp.Exists(
            this=exp.select("1")
            .from_(exp.Unnest(expressions=[expression.left]).as_("_unnest", table=["_col"]))
            .where(exp.column("_col").eq(expression.right))
        )
    )


def _ts_or_ds_add_sql(self: BigQuery.Generator, expression: exp.TsOrDsAdd) -> str:
    return date_add_interval_sql("DATE", "ADD")(self, ts_or_ds_add_cast(expression))


def _ts_or_ds_diff_sql(self: BigQuery.Generator, expression: exp.TsOrDsDiff) -> str:
    expression.this.replace(exp.cast(expression.this, exp.DataType.Type.TIMESTAMP))
    expression.expression.replace(exp.cast(expression.expression, exp.DataType.Type.TIMESTAMP))
    unit = unit_to_var(expression)
    return self.func("DATE_DIFF", expression.this, expression.expression, unit)


def _unix_to_time_sql(self: BigQuery.Generator, expression: exp.UnixToTime) -> str:
    scale = expression.args.get("scale")
    timestamp = expression.this

    if scale in (None, exp.UnixToTime.SECONDS):
        return self.func("TIMESTAMP_SECONDS", timestamp)
    if scale == exp.UnixToTime.MILLIS:
        return self.func("TIMESTAMP_MILLIS", timestamp)
    if scale == exp.UnixToTime.MICROS:
        return self.func("TIMESTAMP_MICROS", timestamp)

    unix_seconds = exp.cast(
        exp.Div(this=timestamp, expression=exp.func("POW", 10, scale)), exp.DataType.Type.BIGINT
    )
    return self.func("TIMESTAMP_SECONDS", unix_seconds)


def _build_time(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToTime(this=args[0])
    if len(args) == 2:
        return exp.Time.from_arg_list(args)
    return exp.TimeFromParts.from_arg_list(args)


def _build_datetime(args: t.List) -> exp.Func:
    if len(args) == 1:
        return exp.TsOrDsToDatetime.from_arg_list(args)
    if len(args) == 2:
        return exp.Datetime.from_arg_list(args)
    return exp.TimestampFromParts.from_arg_list(args)


def _build_regexp_extract(
    expr_type: t.Type[E], default_group: t.Optional[exp.Expression] = None
) -> t.Callable[[t.List], E]:
    def _builder(args: t.List) -> E:
        try:
            group = re.compile(args[1].name).groups == 1
        except re.error:
            group = False

        # Default group is used for the transpilation of REGEXP_EXTRACT_ALL
        return expr_type(
            this=seq_get(args, 0),
            expression=seq_get(args, 1),
            position=seq_get(args, 2),
            occurrence=seq_get(args, 3),
            group=exp.Literal.number(1) if group else default_group,
        )

    return _builder


def _build_extract_json_with_default_path(expr_type: t.Type[E]) -> t.Callable[[t.List, Dialect], E]:
    def _builder(args: t.List, dialect: Dialect) -> E:
        if len(args) == 1:
            # The default value for the JSONPath is '$' i.e all of the data
            args.append(exp.Literal.string("$"))
        return parser.build_extract_json_with_path(expr_type)(args, dialect)

    return _builder


def _str_to_datetime_sql(
    self: BigQuery.Generator, expression: exp.StrToDate | exp.StrToTime
) -> str:
    this = self.sql(expression, "this")
    dtype = "DATE" if isinstance(expression, exp.StrToDate) else "TIMESTAMP"

    if expression.args.get("safe"):
        fmt = self.format_time(
            expression,
            self.dialect.INVERSE_FORMAT_MAPPING,
            self.dialect.INVERSE_FORMAT_TRIE,
        )
        return f"SAFE_CAST({this} AS {dtype} FORMAT {fmt})"

    fmt = self.format_time(expression)
    return self.func(f"PARSE_{dtype}", fmt, this, expression.args.get("zone"))


def _annotate_math_functions(self: TypeAnnotator, expression: E) -> E:
    """
    Many BigQuery math functions such as CEIL, FLOOR etc follow this return type convention:
    +---------+---------+---------+------------+---------+
    |  INPUT  |  INT64  | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    |  OUTPUT | FLOAT64 | NUMERIC | BIGNUMERIC | FLOAT64 |
    +---------+---------+---------+------------+---------+
    """
    self._annotate_args(expression)

    this: exp.Expression = expression.this

    self._set_type(
        expression,
        exp.DataType.Type.DOUBLE if this.is_type(*exp.DataType.INTEGER_TYPES) else this.type,
    )
    return expression


@unsupported_args("ins_cost", "del_cost", "sub_cost")
def _levenshtein_sql(self: BigQuery.Generator, expression: exp.Levenshtein) -> str:
    max_dist = expression.args.get("max_dist")
    if max_dist:
        max_dist = exp.Kwarg(this=exp.var("max_distance"), expression=max_dist)

    return self.func("EDIT_DISTANCE", expression.this, expression.expression, max_dist)


def _build_levenshtein(args: t.List) -> exp.Levenshtein:
    max_dist = seq_get(args, 2)
    return exp.Levenshtein(
        this=seq_get(args, 0),
        expression=seq_get(args, 1),
        max_dist=max_dist.expression if max_dist else None,
    )


def _build_format_time(expr_type: t.Type[exp.Expression]) -> t.Callable[[t.List], exp.TimeToStr]:
    def _builder(args: t.List) -> exp.TimeToStr:
        return exp.TimeToStr(
            this=expr_type(this=seq_get(args, 1)),
            format=seq_get(args, 0),
            zone=seq_get(args, 2),
        )

    return _builder


def _build_contains_substring(args: t.List) -> exp.Contains | exp.Anonymous:
    if len(args) == 3:
        return exp.Anonymous(this="CONTAINS_SUBSTR", expressions=args)

    # Lowercase the operands in case of transpilation, as exp.Contains
    # is case-sensitive on other dialects
    this = exp.Lower(this=seq_get(args, 0))
    expr = exp.Lower(this=seq_get(args, 1))

    return exp.Contains(this=this, expression=expr)


def _json_extract_sql(self: BigQuery.Generator, expression: JSON_EXTRACT_TYPE) -> str:
    name = (expression._meta and expression.meta.get("name")) or expression.sql_name()
    upper = name.upper()

    dquote_escaping = upper in DQUOTES_ESCAPING_JSON_FUNCTIONS

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = False

    sql = rename_func(upper)(self, expression)

    if dquote_escaping:
        self._quote_json_path_key_using_brackets = True

    return sql


def _annotate_concat(self: TypeAnnotator, expression: exp.Concat) -> exp.Concat:
    annotated = self._annotate_by_args(expression, "expressions")

    # Args must be BYTES or types that can be cast to STRING, return type is either BYTES or STRING
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#concat
    if not annotated.is_type(exp.DataType.Type.BINARY, exp.DataType.Type.UNKNOWN):
        annotated.type = exp.DataType.Type.VARCHAR

    return annotated


class BigQuery(Dialect):
    WEEK_OFFSET = -1
    UNNEST_COLUMN_ONLY = True
    SUPPORTS_USER_DEFINED_TYPES = False
    SUPPORTS_SEMI_ANTI_JOIN = False
    LOG_BASE_FIRST = False
    HEX_LOWERCASE = True
    FORCE_EARLY_ALIAS_REF_EXPANSION = True
    PRESERVE_ORIGINAL_NAMES = True
    HEX_STRING_IS_INTEGER_TYPE = True

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#case_sensitivity
    NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_INSENSITIVE

    # bigquery udfs are case sensitive
    NORMALIZE_FUNCTIONS = False

    # https://cloud.google.com/bigquery/docs/reference/standard-sql/format-elements#format_elements_date_time
    TIME_MAPPING = {
        "%D": "%m/%d/%y",
        "%E6S": "%S.%f",
        "%e": "%-d",
    }

    FORMAT_MAPPING = {
        "DD": "%d",
        "MM": "%m",
        "MON": "%b",
        "MONTH": "%B",
        "YYYY": "%Y",
        "YY": "%y",
        "HH": "%I",
        "HH12": "%I",
        "HH24": "%H",
        "MI": "%M",
        "SS": "%S",
        "SSSSS": "%f",
        "TZH": "%z",
    }

    # The _PARTITIONTIME and _PARTITIONDATE pseudo-columns are not returned by a SELECT * statement
    # https://cloud.google.com/bigquery/docs/querying-partitioned-tables#query_an_ingestion-time_partitioned_table
    PSEUDOCOLUMNS = {"_PARTITIONTIME", "_PARTITIONDATE"}

    # All set operations require either a DISTINCT or ALL specifier
    SET_OP_DISTINCT_BY_DEFAULT = dict.fromkeys((exp.Except, exp.Intersect, exp.Union), None)

    # BigQuery maps Type.TIMESTAMP to DATETIME, so we need to amend the inferred types
    TYPE_TO_EXPRESSIONS = {
        **Dialect.TYPE_TO_EXPRESSIONS,
        exp.DataType.Type.TIMESTAMPTZ: Dialect.TYPE_TO_EXPRESSIONS[exp.DataType.Type.TIMESTAMP],
    }
    TYPE_TO_EXPRESSIONS.pop(exp.DataType.Type.TIMESTAMP)

    ANNOTATORS = {
        **Dialect.ANNOTATORS,
        **{
            expr_type: annotate_with_type_lambda(data_type)
            for data_type, expressions in TYPE_TO_EXPRESSIONS.items()
            for expr_type in expressions
        },
        **{
            expr_type: lambda self, e: _annotate_math_functions(self, e)
            for expr_type in (exp.Floor, exp.Ceil, exp.Log, exp.Ln, exp.Sqrt, exp.Exp, exp.Round)
        },
        **{
            expr_type: lambda self, e: self._annotate_by_args(e, "this")
            for expr_type in (
                exp.Left,
                exp.Right,
                exp.Lower,
                exp.Upper,
                exp.Pad,
                exp.Trim,
                exp.RegexpExtract,
                exp.RegexpReplace,
                exp.Repeat,
                exp.Substring,
            )
        },
        exp.Concat: _annotate_concat,
        exp.Sign: lambda self, e: self._annotate_by_args(e, "this"),
        exp.Split: lambda self, e: self._annotate_by_args(e, "this", array=True),
    }

    def normalize_identifier(self, expression: E) -> E:
        if (
            isinstance(expression, exp.Identifier)
            and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE
        ):
            parent = expression.parent
            while isinstance(parent, exp.Dot):
                parent = parent.parent

            # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive
            # by default. The following check uses a heuristic to detect tables based on whether
            # they are qualified. This should generally be correct, because tables in BigQuery
            # must be qualified with at least a dataset, unless @@dataset_id is set.
            case_sensitive = (
                isinstance(parent, exp.UserDefinedFunction)
                or (
                    isinstance(parent, exp.Table)
                    and parent.db
                    and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column"))
                )
                or expression.meta.get("is_table")
            )
            if not case_sensitive:
                expression.set("this", expression.this.lower())

            return t.cast(E, expression)

        return super().normalize_identifier(expression)

    class Tokenizer(tokens.Tokenizer):
        QUOTES = ["'", '"', '"""', "'''"]
        COMMENTS = ["--", "#", ("/*", "*/")]
        IDENTIFIERS = ["`"]
        STRING_ESCAPES = ["\\"]

        HEX_STRINGS = [("0x", ""), ("0X", "")]

        BYTE_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B")
        ]

        RAW_STRINGS = [
            (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R")
        ]

        NESTED_COMMENTS = False

        KEYWORDS = {
            **tokens.Tokenizer.KEYWORDS,
            "ANY TYPE": TokenType.VARIANT,
            "BEGIN": TokenType.COMMAND,
            "BEGIN TRANSACTION": TokenType.BEGIN,
            "BYTEINT": TokenType.INT,
            "BYTES": TokenType.BINARY,
            "CURRENT_DATETIME": TokenType.CURRENT_DATETIME,
            "DATETIME": TokenType.TIMESTAMP,
            "DECLARE": TokenType.COMMAND,
            "ELSEIF": TokenType.COMMAND,
            "EXCEPTION": TokenType.COMMAND,
            "EXPORT": TokenType.EXPORT,
            "FLOAT64": TokenType.DOUBLE,
            "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT,
            "MODEL": TokenType.MODEL,
            "NOT DETERMINISTIC": TokenType.VOLATILE,
            "RECORD": TokenType.STRUCT,
            "TIMESTAMP": TokenType.TIMESTAMPTZ,
        }
        KEYWORDS.pop("DIV")
        KEYWORDS.pop("VALUES")
        KEYWORDS.pop("/*+")

    class Parser(parser.Parser):
        PREFIXED_PIVOT_COLUMNS = True
        LOG_DEFAULTS_TO_LN = True
        SUPPORTS_IMPLICIT_UNNEST = True
        JOINS_HAVE_EQUAL_PRECEDENCE = True

        # BigQuery does not allow ASC/DESC to be used as an identifier
        ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC}
        ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}
        COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - {
            TokenType.ASC,
            TokenType.DESC,
        }
        UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC}

        FUNCTIONS = {
            **parser.Parser.FUNCTIONS,
            "CONTAINS_SUBSTR": _build_contains_substring,
            "DATE": _build_date,
            "DATE_ADD": build_date_delta_with_interval(exp.DateAdd),
            "DATE_SUB": build_date_delta_with_interval(exp.DateSub),
            "DATE_TRUNC": lambda args: exp.DateTrunc(
                unit=seq_get(args, 1),
                this=seq_get(args, 0),
                zone=seq_get(args, 2),
            ),
            "DATETIME": _build_datetime,
            "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd),
            "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub),
            "DIV": binary_from_function(exp.IntDiv),
            "EDIT_DISTANCE": _build_levenshtein,
            "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate),
            "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list,
            "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract),
            "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray),
            "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar),
            "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray),
            "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True),
            "MD5": exp.MD5Digest.from_arg_list,
            "TO_HEX": _build_to_hex,
            "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")(
                [seq_get(args, 1), seq_get(args, 0)]
            ),
            "PARSE_TIMESTAMP": _build_parse_timestamp,
            "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list,
            "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract),
            "REGEXP_EXTRACT_ALL": _build_regexp_extract(
                exp.RegexpExtractAll, default_group=exp.Literal.number(0)
            ),
            "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)),
            "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)),
            "SPLIT": lambda args: exp.Split(
                # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split
                this=seq_get(args, 0),
                expression=seq_get(args, 1) or exp.Literal.string(","),
            ),
            "STRPOS": exp.StrPosition.from_arg_list,
            "TIME": _build_time,
            "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd),
            "TIME_SUB": build_date_delta_with_interval(exp.TimeSub),
            "TIMESTAMP": _build_timestamp,
            "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd),
            "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub),
            "TIMESTAMP_MICROS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MICROS
            ),
            "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime(
                this=seq_get(args, 0), scale=exp.UnixToTime.MILLIS
            ),
            "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)),
            "TO_JSON_STRING": exp.JSONFormat.from_arg_list,
            "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime),
            "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp),
        }

        FUNCTION_PARSERS = {
            **parser.Parser.FUNCTION_PARSERS,
            "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]),
            "MAKE_INTERVAL": lambda self: self._parse_make_interval(),
            "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(),
        }
        FUNCTION_PARSERS.pop("TRIM")
        NO_PAREN_FUNCTIONS = {
            **parser.Parser.NO_PAREN_FUNCTIONS,
            TokenType.CURRENT_DATETIME: exp.CurrentDatetime,
        }

        NESTED_TYPE_TOKENS = {
            *parser.Parser.NESTED_TYPE_TOKENS,
            TokenType.TABLE,
        }

        PROPERTY_PARSERS = {
            **parser.Parser.PROPERTY_PARSERS,
            "NOT DETERMINISTIC": lambda self: self.expression(
                exp.StabilityProperty, this=exp.Literal.string("VOLATILE")
            ),
            "OPTIONS": lambda self: self._parse_with_property(),
        }

        CONSTRAINT_PARSERS = {
            **parser.Parser.CONSTRAINT_PARSERS,
            "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()),
        }

        RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy()
        RANGE_PARSERS.pop(TokenType.OVERLAPS)

        NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN}

        DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN}

        STATEMENT_PARSERS = {
            **parser.Parser.STATEMENT_PARSERS,
            TokenType.ELSE: lambda self: self._parse_as_command(self._prev),
            TokenType.END: lambda self: self._parse_as_command(self._prev),
            TokenType.FOR: lambda self: self._parse_for_in(),
            TokenType.EXPORT: lambda self: self._parse_export_data(),
        }

        BRACKET_OFFSETS = {
            "OFFSET": (0, False),
            "ORDINAL": (1, False),
            "SAFE_OFFSET": (0, True),
            "SAFE_ORDINAL": (1, True),
        }

        def _parse_for_in(self) -> exp.ForIn:
            this = self._parse_range()
            self._match_text_seq("DO")
            return self.expression(exp.ForIn, this=this, expression=self._parse_statement())

        def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]:
            this = super()._parse_table_part(schema=schema) or self._parse_number()

            # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names
            if isinstance(this, exp.Identifier):
                table_name = this.name
                while self._match(TokenType.DASH, advance=False) and self._next:
                    start = self._curr
                    while self._is_connected() and not self._match_set(
                        self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False
                    ):
                        self._advance()

                    if start == self._curr:
                        break

                    table_name += self._find_sql(start, self._prev)

                this = exp.Identifier(
                    this=table_name, quoted=this.args.get("quoted")
                ).update_positions(this)
            elif isinstance(this, exp.Literal):
                table_name = this.name

                if self._is_connected() and self._parse_var(any_token=True):
                    table_name += self._prev.text

                this = exp.Identifier(this=table_name, quoted=True).update_positions(this)

            return this

        def _parse_table_parts(
            self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False
        ) -> exp.Table:
            table = super()._parse_table_parts(
                schema=schema, is_db_reference=is_db_reference, wildcard=True
            )

            # proj-1.db.tbl -- `1.` is tokenized as a float so we need to unravel it here
            if not table.catalog:
                if table.db:
                    previous_db = table.args["db"]
                    parts = table.db.split(".")
                    if len(parts) == 2 and not table.args["db"].quoted:
                        table.set(
                            "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db)
                        )
                        table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db))
                else:
                    previous_this = table.this
                    parts = table.name.split(".")
                    if len(parts) == 2 and not table.this.quoted:
                        table.set(
                            "db", exp.Identifier(this=parts[0]).update_positions(previous_this)
                        )
                        table.set(
                            "this", exp.Identifier(this=parts[1]).update_positions(previous_this)
                        )

            if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts):
                alias = table.this
                catalog, db, this, *rest = (
                    exp.to_identifier(p, quoted=True)
                    for p in split_num_words(".".join(p.name for p in table.parts), ".", 3)
                )

                for part in (catalog, db, this):
                    if part:
                        part.update_positions(table.this)

                if rest and this:
                    this = exp.Dot.build([this, *rest])  # type: ignore

                table = exp.Table(
                    this=this, db=db, catalog=catalog, pivots=table.args.get("pivots")
                )
                table.meta["quoted_table"] = True
            else:
                alias = None

            # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or
            # dataset, so if the project identifier is omitted we need to fix the ast so that
            # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier.
            # Otherwise, we wouldn't correctly qualify a `Table` node that references these
            # views, because it would seem like the "catalog" part is set, when it'd actually
            # be the region/dataset. Merging the two identifiers into a single one is done to
            # avoid producing a 4-part Table reference, which would cause issues in the schema
            # module, when there are 3-part table names mixed with information schema views.
            #
            # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax
            table_parts = table.parts
            if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA":
                # We need to alias the table here to avoid breaking existing qualified columns.
                # This is expected to be safe, because if there's an actual alias coming up in
                # the token stream, it will overwrite this one. If there isn't one, we are only
                # exposing the name that can be used to reference the view explicitly (a no-op).
                exp.alias_(
                    table,
                    t.cast(exp.Identifier, alias or table_parts[-1]),
                    table=True,
                    copy=False,
                )

                info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}"
                new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions(
                    line=table_parts[-2].meta.get("line"),
                    col=table_parts[-1].meta.get("col"),
                    start=table_parts[-2].meta.get("start"),
                    end=table_parts[-1].meta.get("end"),
                )
                table.set("this", new_this)
                table.set("db", seq_get(table_parts, -3))
                table.set("catalog", seq_get(table_parts, -4))

            return table

        def _parse_column(self) -> t.Optional[exp.Expression]:
            column = super()._parse_column()
            if isinstance(column, exp.Column):
                parts = column.parts
                if any("." in p.name for p in parts):
                    catalog, db, table, this, *rest = (
                        exp.to_identifier(p, quoted=True)
                        for p in split_num_words(".".join(p.name for p in parts), ".", 4)
                    )

                    if rest and this:
                        this = exp.Dot.build([this, *rest])  # type: ignore

                    column = exp.Column(this=this, table=table, db=db, catalog=catalog)
                    column.meta["quoted_column"] = True

            return column

        @t.overload
        def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ...

        @t.overload
        def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ...
        def _parse_json_object(self, agg=False):
            json_object = super()._parse_json_object()
            array_kv_pair = seq_get(json_object.expressions, 0)

            # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation
            # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2
            if (
                array_kv_pair
                and isinstance(array_kv_pair.this, exp.Array)
                and isinstance(array_kv_pair.expression, exp.Array)
            ):
                keys = array_kv_pair.this.expressions
                values = array_kv_pair.expression.expressions

                json_object.set(
                    "expressions",
                    [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)],
                )

            return json_object

        def _parse_bracket(
            self, this: t.Optional[exp.Expression] = None
        ) -> t.Optional[exp.Expression]:
            bracket = super()._parse_bracket(this)

            if this is bracket:
                return bracket

            if isinstance(bracket, exp.Bracket):
                for expression in bracket.expressions:
                    name = expression.name.upper()

                    if name not in self.BRACKET_OFFSETS:
                        break

                    offset, safe = self.BRACKET_OFFSETS[name]
                    bracket.set("offset", offset)
                    bracket.set("safe", safe)
                    expression.replace(expression.expressions[0])

            return bracket

        def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]:
            unnest = super()._parse_unnest(with_alias=with_alias)

            if not unnest:
                return None

            unnest_expr = seq_get(unnest.expressions, 0)
            if unnest_expr:
                from sqlglot.optimizer.annotate_types import annotate_types

                unnest_expr = annotate_types(unnest_expr, dialect=self.dialect)

                # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields,
                # in contrast to other dialects such as DuckDB which flattens only the array by default
                if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any(
                    array_elem.is_type(exp.DataType.Type.STRUCT)
                    for array_elem in unnest_expr._type.expressions
                ):
                    unnest.set("explode_array", True)

            return unnest

        def _parse_make_interval(self) -> exp.MakeInterval:
            expr = exp.MakeInterval()

            for arg_key in expr.arg_types:
                value = self._parse_lambda()

                if not value:
                    break

                # Non-named arguments are filled sequentially, (optionally) followed by named arguments
                # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2)
                if isinstance(value, exp.Kwarg):
                    arg_key = value.this.name

                expr.set(arg_key, value)

                self._match(TokenType.COMMA)

            return expr

        def _parse_features_at_time(self) -> exp.FeaturesAtTime:
            expr = self.expression(
                exp.FeaturesAtTime,
                this=(self._match(TokenType.TABLE) and self._parse_table())
                or self._parse_select(nested=True),
            )

            while self._match(TokenType.COMMA):
                arg = self._parse_lambda()

                # Get the LHS of the Kwarg and set the arg to that value, e.g
                # "num_rows => 1" sets the expr's `num_rows` arg
                if arg:
                    expr.set(arg.this.name, arg)

            return expr

        def _parse_export_data(self) -> exp.Export:
            self._match_text_seq("DATA")

            return self.expression(
                exp.Export,
                connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(),
                options=self._parse_properties(),
                this=self._match_text_seq("AS") and self._parse_select(),
            )

    class Generator(generator.Generator):
        INTERVAL_ALLOWS_PLURAL_FORM = False
        JOIN_HINTS = False
        QUERY_HINTS = False
        TABLE_HINTS = False
        LIMIT_FETCH = "LIMIT"
        RENAME_TABLE_WITH_DB = False
        NVL2_SUPPORTED = False
        UNNEST_WITH_ORDINALITY = False
        COLLATE_IS_FUNC = True
        LIMIT_ONLY_LITERALS = True
        SUPPORTS_TABLE_ALIAS_COLUMNS = False
        UNPIVOT_ALIASES_ARE_IDENTIFIERS = False
        JSON_KEY_VALUE_PAIR_SEP = ","
        NULL_ORDERING_SUPPORTED = False
        IGNORE_NULLS_IN_FUNC = True
        JSON_PATH_SINGLE_QUOTE_ESCAPE = True
        CAN_IMPLEMENT_ARRAY_ANY = True
        SUPPORTS_TO_NUMBER = False
        NAMED_PLACEHOLDER_TOKEN = "@"
        HEX_FUNC = "TO_HEX"
        WITH_PROPERTIES_PREFIX = "OPTIONS"
        SUPPORTS_EXPLODING_PROJECTIONS = False
        EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False
        SUPPORTS_UNIX_SECONDS = True

        TRANSFORMS = {
            **generator.Generator.TRANSFORMS,
            exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"),
            exp.ArgMax: arg_max_or_min_no_count("MAX_BY"),
            exp.ArgMin: arg_max_or_min_no_count("MIN_BY"),
            exp.Array: inline_array_unless_query,
            exp.ArrayContains: _array_contains_sql,
            exp.ArrayFilter: filter_array_using_unnest,
            exp.ArrayRemove: filter_array_using_unnest,
            exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]),
            exp.CollateProperty: lambda self, e: (
                f"DEFAULT COLLATE {self.sql(e, 'this')}"
                if e.args.get("default")
                else f"COLLATE {self.sql(e, 'this')}"
            ),
            exp.Commit: lambda *_: "COMMIT TRANSACTION",
            exp.CountIf: rename_func("COUNTIF"),
            exp.Create: _create_sql,
            exp.CTE: transforms.preprocess([_pushdown_cte_column_names]),
            exp.DateAdd: date_add_interval_sql("DATE", "ADD"),
            exp.DateDiff: lambda self, e: self.func(
                "DATE_DIFF", e.this, e.expression, unit_to_var(e)
            ),
            exp.DateFromParts: rename_func("DATE"),
            exp.DateStrToDate: datestrtodate_sql,
            exp.DateSub: date_add_interval_sql("DATE", "SUB"),
            exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"),
            exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"),
            exp.FromTimeZone: lambda self, e: self.func(
                "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'"
            ),
            exp.GenerateSeries: rename_func("GENERATE_ARRAY"),
            exp.GroupConcat: lambda self, e: groupconcat_sql(
                self, e, func_name="STRING_AGG", within_group=False
            ),
            exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))),
            exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"),
            exp.If: if_sql(false_value="NULL"),
            exp.ILike: no_ilike_sql,
            exp.IntDiv: rename_func("DIV"),
            exp.Int64: rename_func("INT64"),
            exp.JSONExtract: _json_extract_sql,
            exp.JSONExtractArray: _json_extract_sql,
            exp.JSONExtractScalar: _json_extract_sql,
            exp.JSONFormat: rename_func("TO_JSON_STRING"),
            exp.Levenshtein: _levenshtein_sql,
            exp.Max: max_or_greatest,
            exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)),
            exp.MD5Digest: rename_func("MD5"),
            exp.Min: min_or_least,
            exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}",
            exp.RegexpExtract: lambda self, e: self.func(
                "REGEXP_EXTRACT",
                e.this,
                e.expression,
                e.args.get("position"),
                e.args.get("occurrence"),
            ),
            exp.RegexpExtractAll: lambda self, e: self.func(
                "REGEXP_EXTRACT_ALL", e.this, e.expression
            ),
            exp.RegexpReplace: regexp_replace_sql,
            exp.RegexpLike: rename_func("REGEXP_CONTAINS"),
            exp.ReturnsProperty: _returnsproperty_sql,
            exp.Rollback: lambda *_: "ROLLBACK TRANSACTION",
            exp.Select: transforms.preprocess(
                [
                    transforms.explode_projection_to_unnest(),
                    transforms.unqualify_unnest,
                    transforms.eliminate_distinct_on,
                    _alias_ordered_group,
                    transforms.eliminate_semi_and_anti_joins,
                ]
            ),
            exp.SHA: rename_func("SHA1"),
            exp.SHA2: sha256_sql,
            exp.StabilityProperty: lambda self, e: (
                "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC"
            ),
            exp.String: rename_func("STRING"),
            exp.StrPosition: lambda self, e: (
                strposition_sql(
                    self, e, func_name="INSTR", supports_position=True, supports_occurrence=True
                )
            ),
            exp.StrToDate: _str_to_datetime_sql,
            exp.StrToTime: _str_to_datetime_sql,
            exp.TimeAdd: date_add_interval_sql("TIME", "ADD"),
            exp.TimeFromParts: rename_func("TIME"),
            exp.TimestampFromParts: rename_func("DATETIME"),
            exp.TimeSub: date_add_interval_sql("TIME", "SUB"),
            exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"),
            exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"),
            exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"),
            exp.TimeStrToTime: timestrtotime_sql,
            exp.Transaction: lambda *_: "BEGIN TRANSACTION",
            exp.TsOrDsAdd: _ts_or_ds_add_sql,
            exp.TsOrDsDiff: _ts_or_ds_diff_sql,
            exp.TsOrDsToTime: rename_func("TIME"),
            exp.TsOrDsToDatetime: rename_func("DATETIME"),
            exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"),
            exp.Unhex: rename_func("FROM_HEX"),
            exp.UnixDate: rename_func("UNIX_DATE"),
            exp.UnixToTime: _unix_to_time_sql,
            exp.Uuid: lambda *_: "GENERATE_UUID()",
            exp.Values: _derived_table_values_to_unnest,
            exp.VariancePop: rename_func("VAR_POP"),
            exp.SafeDivide: rename_func("SAFE_DIVIDE"),
        }

        SUPPORTED_JSON_PATH_PARTS = {
            exp.JSONPathKey,
            exp.JSONPathRoot,
            exp.JSONPathSubscript,
        }

        TYPE_MAPPING = {
            **generator.Generator.TYPE_MAPPING,
            exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC",
            exp.DataType.Type.BIGINT: "INT64",
            exp.DataType.Type.BINARY: "BYTES",
            exp.DataType.Type.BLOB: "BYTES",
            exp.DataType.Type.BOOLEAN: "BOOL",
            exp.DataType.Type.CHAR: "STRING",
            exp.DataType.Type.DECIMAL: "NUMERIC",
            exp.DataType.Type.DOUBLE: "FLOAT64",
            exp.DataType.Type.FLOAT: "FLOAT64",
            exp.DataType.Type.INT: "INT64",
            exp.DataType.Type.NCHAR: "STRING",
            exp.DataType.Type.NVARCHAR: "STRING",
            exp.DataType.Type.SMALLINT: "INT64",
            exp.DataType.Type.TEXT: "STRING",
            exp.DataType.Type.TIMESTAMP: "DATETIME",
            exp.DataType.Type.TIMESTAMPNTZ: "DATETIME",
            exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP",
            exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP",
            exp.DataType.Type.TINYINT: "INT64",
            exp.DataType.Type.ROWVERSION: "BYTES",
            exp.DataType.Type.UUID: "STRING",
            exp.DataType.Type.VARBINARY: "BYTES",
            exp.DataType.Type.VARCHAR: "STRING",
            exp.DataType.Type.VARIANT: "ANY TYPE",
        }

        PROPERTIES_LOCATION = {
            **generator.Generator.PROPERTIES_LOCATION,
            exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA,
            exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED,
        }

        # WINDOW comes after QUALIFY
        # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause
        AFTER_HAVING_MODIFIER_TRANSFORMS = {
            "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"],
            "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"],
        }

        # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords
        RESERVED_KEYWORDS = {
            "all",
            "and",
            "any",
            "array",
            "as",
            "asc",
            "assert_rows_modified",
            "at",
            "between",
            "by",
            "case",
            "cast",
            "collate",
            "contains",
            "create",
            "cross",
            "cube",
            "current",
            "default",
            "define",
            "desc",
            "distinct",
            "else",
            "end",
            "enum",
            "escape",
            "except",
            "exclude",
            "exists",
            "extract",
            "false",
            "fetch",
            "following",
            "for",
            "from",
            "full",
            "group",
            "grouping",
            "groups",
            "hash",
            "having",
            "if",
            "ignore",
            "in",
            "inner",
            "intersect",
            "interval",
            "into",
            "is",
            "join",
            "lateral",
            "left",
            "like",
            "limit",
            "lookup",
            "merge",
            "natural",
            "new",
            "no",
            "not",
            "null",
            "nulls",
            "of",
            "on",
            "or",
            "order",
            "outer",
            "over",
            "partition",
            "preceding",
            "proto",
            "qualify",
            "range",
            "recursive",
            "respect",
            "right",
            "rollup",
            "rows",
            "select",
            "set",
            "some",
            "struct",
            "tablesample",
            "then",
            "to",
            "treat",
            "true",
            "unbounded",
            "union",
            "unnest",
            "using",
            "when",
            "where",
            "window",
            "with",
            "within",
        }

        def datetrunc_sql(self, expression: exp.DateTrunc) -> str:
            unit = expression.unit
            unit_sql = unit.name if unit.is_string else self.sql(unit)
            return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone"))

        def mod_sql(self, expression: exp.Mod) -> str:
            this = expression.this
            expr = expression.expression
            return self.func(
                "MOD",
                this.unnest() if isinstance(this, exp.Paren) else this,
                expr.unnest() if isinstance(expr, exp.Paren) else expr,
            )

        def column_parts(self, expression: exp.Column) -> str:
            if expression.meta.get("quoted_column"):
                # If a column reference is of the form `dataset.table`.name, we need
                # to preserve the quoted table path, otherwise the reference breaks
                table_parts = ".".join(p.name for p in expression.parts[:-1])
                table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
                return f"{table_path}.{self.sql(expression, 'this')}"

            return super().column_parts(expression)

        def table_parts(self, expression: exp.Table) -> str:
            # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
            # we need to make sure the correct quoting is used in each case.
            #
            # For example, if there is a CTE x that clashes with a schema name, then the former will
            # return the table y in that schema, whereas the latter will return the CTE's y column:
            #
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
            # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
            if expression.meta.get("quoted_table"):
                table_parts = ".".join(p.name for p in expression.parts)
                return self.sql(exp.Identifier(this=table_parts, quoted=True))

            return super().table_parts(expression)

        def timetostr_sql(self, expression: exp.TimeToStr) -> str:
            this = expression.this
            if isinstance(this, exp.TsOrDsToDatetime):
                func_name = "FORMAT_DATETIME"
            elif isinstance(this, exp.TsOrDsToTimestamp):
                func_name = "FORMAT_TIMESTAMP"
            else:
                func_name = "FORMAT_DATE"

            time_expr = (
                this
                if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
                else expression
            )
            return self.func(
                func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
            )

        def eq_sql(self, expression: exp.EQ) -> str:
            # Operands of = cannot be NULL in BigQuery
            if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
                if not isinstance(expression.parent, exp.Update):
                    return "NULL"

            return self.binary(expression, "=")

        def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
            parent = expression.parent

            # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
            # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
            if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
                return self.func(
                    "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
                )

            return super().attimezone_sql(expression)

        def trycast_sql(self, expression: exp.TryCast) -> str:
            return self.cast_sql(expression, safe_prefix="SAFE_")

        def bracket_sql(self, expression: exp.Bracket) -> str:
            this = expression.this
            expressions = expression.expressions

            if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
                arg = expressions[0]
                if arg.type is None:
                    from sqlglot.optimizer.annotate_types import annotate_types

                    arg = annotate_types(arg, dialect=self.dialect)

                if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
                    # BQ doesn't support bracket syntax with string values for structs
                    return f"{self.sql(this)}.{arg.name}"

            expressions_sql = self.expressions(expression, flat=True)
            offset = expression.args.get("offset")

            if offset == 0:
                expressions_sql = f"OFFSET({expressions_sql})"
            elif offset == 1:
                expressions_sql = f"ORDINAL({expressions_sql})"
            elif offset is not None:
                self.unsupported(f"Unsupported array offset: {offset}")

            if expression.args.get("safe"):
                expressions_sql = f"SAFE_{expressions_sql}"

            return f"{self.sql(this)}[{expressions_sql}]"

        def in_unnest_op(self, expression: exp.Unnest) -> str:
            return self.sql(expression)

        def version_sql(self, expression: exp.Version) -> str:
            if expression.name == "TIMESTAMP":
                expression.set("this", "SYSTEM_TIME")
            return super().version_sql(expression)

        def contains_sql(self, expression: exp.Contains) -> str:
            this = expression.this
            expr = expression.expression

            if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower):
                this = this.this
                expr = expr.this

            return self.func("CONTAINS_SUBSTR", this, expr)

        def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
            this = expression.this

            # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
            # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
            # because they aren't literals and so the above syntax is invalid BigQuery.
            if isinstance(this, exp.Array):
                elem = seq_get(this.expressions, 0)
                if not (elem and elem.find(exp.Query)):
                    return f"{self.sql(expression, 'to')}{self.sql(this)}"

            return super().cast_sql(expression, safe_prefix=safe_prefix)
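The generator above rewrites several constructs that BigQuery lacks. A small sketch of one of them, the VALUES-to-UNNEST rewrite performed by _derived_table_values_to_unnest (illustrative; the struct field names come from the derived table's column aliases, and exact rendering may vary by sqlglot version):

    import sqlglot

    # A VALUES clause used as a derived table is not valid BigQuery, so writing with
    # write="bigquery" emits UNNEST([STRUCT(...), ...]) in the FROM clause instead.
    sql = "SELECT t.a, t.b FROM (VALUES (1, 'x'), (2, 'y')) AS t(a, b)"
    print(sqlglot.transpile(sql, read="duckdb", write="bigquery")[0])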
JOIN_HINTS = False 916 QUERY_HINTS = False 917 TABLE_HINTS = False 918 LIMIT_FETCH = "LIMIT" 919 RENAME_TABLE_WITH_DB = False 920 NVL2_SUPPORTED = False 921 UNNEST_WITH_ORDINALITY = False 922 COLLATE_IS_FUNC = True 923 LIMIT_ONLY_LITERALS = True 924 SUPPORTS_TABLE_ALIAS_COLUMNS = False 925 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 926 JSON_KEY_VALUE_PAIR_SEP = "," 927 NULL_ORDERING_SUPPORTED = False 928 IGNORE_NULLS_IN_FUNC = True 929 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 930 CAN_IMPLEMENT_ARRAY_ANY = True 931 SUPPORTS_TO_NUMBER = False 932 NAMED_PLACEHOLDER_TOKEN = "@" 933 HEX_FUNC = "TO_HEX" 934 WITH_PROPERTIES_PREFIX = "OPTIONS" 935 SUPPORTS_EXPLODING_PROJECTIONS = False 936 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 937 SUPPORTS_UNIX_SECONDS = True 938 939 TRANSFORMS = { 940 **generator.Generator.TRANSFORMS, 941 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 942 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 943 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 944 exp.Array: inline_array_unless_query, 945 exp.ArrayContains: _array_contains_sql, 946 exp.ArrayFilter: filter_array_using_unnest, 947 exp.ArrayRemove: filter_array_using_unnest, 948 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 949 exp.CollateProperty: lambda self, e: ( 950 f"DEFAULT COLLATE {self.sql(e, 'this')}" 951 if e.args.get("default") 952 else f"COLLATE {self.sql(e, 'this')}" 953 ), 954 exp.Commit: lambda *_: "COMMIT TRANSACTION", 955 exp.CountIf: rename_func("COUNTIF"), 956 exp.Create: _create_sql, 957 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 958 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 959 exp.DateDiff: lambda self, e: self.func( 960 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 961 ), 962 exp.DateFromParts: rename_func("DATE"), 963 exp.DateStrToDate: datestrtodate_sql, 964 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 965 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 966 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 967 exp.FromTimeZone: lambda self, e: self.func( 968 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 969 ), 970 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 971 exp.GroupConcat: lambda self, e: groupconcat_sql( 972 self, e, func_name="STRING_AGG", within_group=False 973 ), 974 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 975 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 976 exp.If: if_sql(false_value="NULL"), 977 exp.ILike: no_ilike_sql, 978 exp.IntDiv: rename_func("DIV"), 979 exp.Int64: rename_func("INT64"), 980 exp.JSONExtract: _json_extract_sql, 981 exp.JSONExtractArray: _json_extract_sql, 982 exp.JSONExtractScalar: _json_extract_sql, 983 exp.JSONFormat: rename_func("TO_JSON_STRING"), 984 exp.Levenshtein: _levenshtein_sql, 985 exp.Max: max_or_greatest, 986 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 987 exp.MD5Digest: rename_func("MD5"), 988 exp.Min: min_or_least, 989 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 990 exp.RegexpExtract: lambda self, e: self.func( 991 "REGEXP_EXTRACT", 992 e.this, 993 e.expression, 994 e.args.get("position"), 995 e.args.get("occurrence"), 996 ), 997 exp.RegexpExtractAll: lambda self, e: self.func( 998 "REGEXP_EXTRACT_ALL", e.this, e.expression 999 ), 1000 exp.RegexpReplace: regexp_replace_sql, 1001 exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1002 exp.ReturnsProperty: 
_returnsproperty_sql, 1003 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1004 exp.Select: transforms.preprocess( 1005 [ 1006 transforms.explode_projection_to_unnest(), 1007 transforms.unqualify_unnest, 1008 transforms.eliminate_distinct_on, 1009 _alias_ordered_group, 1010 transforms.eliminate_semi_and_anti_joins, 1011 ] 1012 ), 1013 exp.SHA: rename_func("SHA1"), 1014 exp.SHA2: sha256_sql, 1015 exp.StabilityProperty: lambda self, e: ( 1016 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1017 ), 1018 exp.String: rename_func("STRING"), 1019 exp.StrPosition: lambda self, e: ( 1020 strposition_sql( 1021 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1022 ) 1023 ), 1024 exp.StrToDate: _str_to_datetime_sql, 1025 exp.StrToTime: _str_to_datetime_sql, 1026 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1027 exp.TimeFromParts: rename_func("TIME"), 1028 exp.TimestampFromParts: rename_func("DATETIME"), 1029 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1030 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1031 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1032 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1033 exp.TimeStrToTime: timestrtotime_sql, 1034 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1035 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1036 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1037 exp.TsOrDsToTime: rename_func("TIME"), 1038 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1039 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1040 exp.Unhex: rename_func("FROM_HEX"), 1041 exp.UnixDate: rename_func("UNIX_DATE"), 1042 exp.UnixToTime: _unix_to_time_sql, 1043 exp.Uuid: lambda *_: "GENERATE_UUID()", 1044 exp.Values: _derived_table_values_to_unnest, 1045 exp.VariancePop: rename_func("VAR_POP"), 1046 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1047 } 1048 1049 SUPPORTED_JSON_PATH_PARTS = { 1050 exp.JSONPathKey, 1051 exp.JSONPathRoot, 1052 exp.JSONPathSubscript, 1053 } 1054 1055 TYPE_MAPPING = { 1056 **generator.Generator.TYPE_MAPPING, 1057 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1058 exp.DataType.Type.BIGINT: "INT64", 1059 exp.DataType.Type.BINARY: "BYTES", 1060 exp.DataType.Type.BLOB: "BYTES", 1061 exp.DataType.Type.BOOLEAN: "BOOL", 1062 exp.DataType.Type.CHAR: "STRING", 1063 exp.DataType.Type.DECIMAL: "NUMERIC", 1064 exp.DataType.Type.DOUBLE: "FLOAT64", 1065 exp.DataType.Type.FLOAT: "FLOAT64", 1066 exp.DataType.Type.INT: "INT64", 1067 exp.DataType.Type.NCHAR: "STRING", 1068 exp.DataType.Type.NVARCHAR: "STRING", 1069 exp.DataType.Type.SMALLINT: "INT64", 1070 exp.DataType.Type.TEXT: "STRING", 1071 exp.DataType.Type.TIMESTAMP: "DATETIME", 1072 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1073 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1074 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1075 exp.DataType.Type.TINYINT: "INT64", 1076 exp.DataType.Type.ROWVERSION: "BYTES", 1077 exp.DataType.Type.UUID: "STRING", 1078 exp.DataType.Type.VARBINARY: "BYTES", 1079 exp.DataType.Type.VARCHAR: "STRING", 1080 exp.DataType.Type.VARIANT: "ANY TYPE", 1081 } 1082 1083 PROPERTIES_LOCATION = { 1084 **generator.Generator.PROPERTIES_LOCATION, 1085 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1086 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1087 } 1088 1089 # WINDOW comes after QUALIFY 1090 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1091 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1092 "qualify": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1093 
"windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1094 } 1095 1096 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1097 RESERVED_KEYWORDS = { 1098 "all", 1099 "and", 1100 "any", 1101 "array", 1102 "as", 1103 "asc", 1104 "assert_rows_modified", 1105 "at", 1106 "between", 1107 "by", 1108 "case", 1109 "cast", 1110 "collate", 1111 "contains", 1112 "create", 1113 "cross", 1114 "cube", 1115 "current", 1116 "default", 1117 "define", 1118 "desc", 1119 "distinct", 1120 "else", 1121 "end", 1122 "enum", 1123 "escape", 1124 "except", 1125 "exclude", 1126 "exists", 1127 "extract", 1128 "false", 1129 "fetch", 1130 "following", 1131 "for", 1132 "from", 1133 "full", 1134 "group", 1135 "grouping", 1136 "groups", 1137 "hash", 1138 "having", 1139 "if", 1140 "ignore", 1141 "in", 1142 "inner", 1143 "intersect", 1144 "interval", 1145 "into", 1146 "is", 1147 "join", 1148 "lateral", 1149 "left", 1150 "like", 1151 "limit", 1152 "lookup", 1153 "merge", 1154 "natural", 1155 "new", 1156 "no", 1157 "not", 1158 "null", 1159 "nulls", 1160 "of", 1161 "on", 1162 "or", 1163 "order", 1164 "outer", 1165 "over", 1166 "partition", 1167 "preceding", 1168 "proto", 1169 "qualify", 1170 "range", 1171 "recursive", 1172 "respect", 1173 "right", 1174 "rollup", 1175 "rows", 1176 "select", 1177 "set", 1178 "some", 1179 "struct", 1180 "tablesample", 1181 "then", 1182 "to", 1183 "treat", 1184 "true", 1185 "unbounded", 1186 "union", 1187 "unnest", 1188 "using", 1189 "when", 1190 "where", 1191 "window", 1192 "with", 1193 "within", 1194 } 1195 1196 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1197 unit = expression.unit 1198 unit_sql = unit.name if unit.is_string else self.sql(unit) 1199 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1200 1201 def mod_sql(self, expression: exp.Mod) -> str: 1202 this = expression.this 1203 expr = expression.expression 1204 return self.func( 1205 "MOD", 1206 this.unnest() if isinstance(this, exp.Paren) else this, 1207 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1208 ) 1209 1210 def column_parts(self, expression: exp.Column) -> str: 1211 if expression.meta.get("quoted_column"): 1212 # If a column reference is of the form `dataset.table`.name, we need 1213 # to preserve the quoted table path, otherwise the reference breaks 1214 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1215 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1216 return f"{table_path}.{self.sql(expression, 'this')}" 1217 1218 return super().column_parts(expression) 1219 1220 def table_parts(self, expression: exp.Table) -> str: 1221 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1222 # we need to make sure the correct quoting is used in each case. 
1223 # 1224 # For example, if there is a CTE x that clashes with a schema name, then the former will 1225 # return the table y in that schema, whereas the latter will return the CTE's y column: 1226 # 1227 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1228 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1229 if expression.meta.get("quoted_table"): 1230 table_parts = ".".join(p.name for p in expression.parts) 1231 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1232 1233 return super().table_parts(expression) 1234 1235 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1236 this = expression.this 1237 if isinstance(this, exp.TsOrDsToDatetime): 1238 func_name = "FORMAT_DATETIME" 1239 elif isinstance(this, exp.TsOrDsToTimestamp): 1240 func_name = "FORMAT_TIMESTAMP" 1241 else: 1242 func_name = "FORMAT_DATE" 1243 1244 time_expr = ( 1245 this 1246 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1247 else expression 1248 ) 1249 return self.func( 1250 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1251 ) 1252 1253 def eq_sql(self, expression: exp.EQ) -> str: 1254 # Operands of = cannot be NULL in BigQuery 1255 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1256 if not isinstance(expression.parent, exp.Update): 1257 return "NULL" 1258 1259 return self.binary(expression, "=") 1260 1261 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1262 parent = expression.parent 1263 1264 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1265 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1266 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1267 return self.func( 1268 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1269 ) 1270 1271 return super().attimezone_sql(expression) 1272 1273 def trycast_sql(self, expression: exp.TryCast) -> str: 1274 return self.cast_sql(expression, safe_prefix="SAFE_") 1275 1276 def bracket_sql(self, expression: exp.Bracket) -> str: 1277 this = expression.this 1278 expressions = expression.expressions 1279 1280 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1281 arg = expressions[0] 1282 if arg.type is None: 1283 from sqlglot.optimizer.annotate_types import annotate_types 1284 1285 arg = annotate_types(arg, dialect=self.dialect) 1286 1287 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1288 # BQ doesn't support bracket syntax with string values for structs 1289 return f"{self.sql(this)}.{arg.name}" 1290 1291 expressions_sql = self.expressions(expression, flat=True) 1292 offset = expression.args.get("offset") 1293 1294 if offset == 0: 1295 expressions_sql = f"OFFSET({expressions_sql})" 1296 elif offset == 1: 1297 expressions_sql = f"ORDINAL({expressions_sql})" 1298 elif offset is not None: 1299 self.unsupported(f"Unsupported array offset: {offset}") 1300 1301 if expression.args.get("safe"): 1302 expressions_sql = f"SAFE_{expressions_sql}" 1303 1304 return f"{self.sql(this)}[{expressions_sql}]" 1305 1306 def in_unnest_op(self, expression: exp.Unnest) -> str: 1307 return self.sql(expression) 1308 1309 def version_sql(self, expression: exp.Version) -> str: 1310 if expression.name == "TIMESTAMP": 1311 expression.set("this", "SYSTEM_TIME") 1312 return super().version_sql(expression) 1313 1314 def contains_sql(self, 
expression: exp.Contains) -> str: 1315 this = expression.this 1316 expr = expression.expression 1317 1318 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1319 this = this.this 1320 expr = expr.this 1321 1322 return self.func("CONTAINS_SUBSTR", this, expr) 1323 1324 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1325 this = expression.this 1326 1327 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1328 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1329 # because they aren't literals and so the above syntax is invalid BigQuery. 1330 if isinstance(this, exp.Array): 1331 elem = seq_get(this.expressions, 0) 1332 if not (elem and elem.find(exp.Query)): 1333 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1334 1335 return super().cast_sql(expression, safe_prefix=safe_prefix)
First day of the week in DATE_TRUNC(week). Defaults to 0 (Monday). -1 would be Sunday.
Whether the base comes first in the LOG function.
Possible values: True, False, None (two arguments are not supported by LOG).
Whether alias reference expansion (_expand_alias_refs()) should run before column qualification (_qualify_columns()).
For example:
WITH data AS (
  SELECT 1 AS id, 2 AS my_id
)
SELECT id AS my_id
FROM data
WHERE my_id = 1
GROUP BY my_id
HAVING my_id = 1
In most dialects, "my_id" would refer to "data.my_id" across the query, except:
- BigQuery, which will forward the alias to the GROUP BY and HAVING clauses, i.e. it resolves to "WHERE my_id = 1 GROUP BY id HAVING id = 1"
- Clickhouse, which will forward the alias across the query, i.e. it resolves to "WHERE id = 1 GROUP BY id HAVING id = 1"
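As a hedged illustration (not part of this module's source), the difference can be observed by running the optimizer's qualify step under the BigQuery dialect; the query below mirrors the example above:

from sqlglot import parse_one
from sqlglot.optimizer.qualify import qualify

sql = """
WITH data AS (SELECT 1 AS id, 2 AS my_id)
SELECT id AS my_id
FROM data
WHERE my_id = 1
GROUP BY my_id
HAVING my_id = 1
"""

# Under the BigQuery dialect, qualification is expected to resolve the GROUP BY and
# HAVING references per the alias-forwarding rule described above.
print(qualify(parse_one(sql, dialect="bigquery"), dialect="bigquery").sql(dialect="bigquery"))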
Whether the name of the function should be preserved inside the node's metadata. This can be useful for roundtripping deprecated vs. new functions that share an AST node, e.g. JSON_VALUE vs. JSON_EXTRACT_SCALAR in BigQuery.
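As a hedged sketch of the roundtripping this enables (the output shape is an assumption, not asserted by this module):

import sqlglot

# JSON_VALUE and JSON_EXTRACT_SCALAR parse into the same AST node in BigQuery; when
# the original name is kept in the node's metadata, regenerating BigQuery SQL can
# emit the spelling that was parsed rather than normalizing both to one name.
print(sqlglot.transpile("SELECT JSON_VALUE(j, '$.x') FROM t", read="bigquery", write="bigquery")[0])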
Whether hex strings such as x'CC' evaluate to an integer or a binary/blob type.
Specifies the strategy according to which identifiers should be normalized.
Determines how function names are going to be normalized.
Possible values:
"upper" or True: Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
Associates this dialect's time formats with their equivalent Python strftime formats.
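A hedged example of what this mapping enables when transpiling between dialects with different format tokens (the target dialect and exact output are assumptions):

import sqlglot

# BigQuery's %-style format tokens are translated through each dialect's TIME_MAPPING
# when the query is regenerated for another dialect.
print(sqlglot.transpile("SELECT FORMAT_DATE('%Y-%m-%d', d) FROM t", read="bigquery", write="snowflake")[0])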
Helper which is used for parsing the special syntax CAST(x AS DATE FORMAT 'yyyy'). If empty, the corresponding trie will be constructed off of TIME_MAPPING.
Columns that are auto-generated by the engine corresponding to this dialect.
For example, such columns may be excluded from SELECT * queries.
Whether a set operation uses DISTINCT by default. This is None when either DISTINCT or ALL must be explicitly specified.
453 def normalize_identifier(self, expression: E) -> E: 454 if ( 455 isinstance(expression, exp.Identifier) 456 and self.normalization_strategy is NormalizationStrategy.CASE_INSENSITIVE 457 ): 458 parent = expression.parent 459 while isinstance(parent, exp.Dot): 460 parent = parent.parent 461 462 # In BigQuery, CTEs are case-insensitive, but UDF and table names are case-sensitive 463 # by default. The following check uses a heuristic to detect tables based on whether 464 # they are qualified. This should generally be correct, because tables in BigQuery 465 # must be qualified with at least a dataset, unless @@dataset_id is set. 466 case_sensitive = ( 467 isinstance(parent, exp.UserDefinedFunction) 468 or ( 469 isinstance(parent, exp.Table) 470 and parent.db 471 and (parent.meta.get("quoted_table") or not parent.meta.get("maybe_column")) 472 ) 473 or expression.meta.get("is_table") 474 ) 475 if not case_sensitive: 476 expression.set("this", expression.this.lower()) 477 478 return t.cast(E, expression) 479 480 return super().normalize_identifier(expression)
Transforms an identifier in a way that resembles how it'd be resolved by this dialect.
For example, an identifier like FoO would be resolved as foo in Postgres, because it lowercases all unquoted identifiers. On the other hand, Snowflake uppercases them, so it would resolve it as FOO. If it was quoted, it'd need to be treated as case-sensitive, and so any normalization would be prohibited in order to avoid "breaking" the identifier.
There are also dialects like Spark, which are case-insensitive even when quotes are present, and dialects like MySQL, whose resolution rules match those employed by the underlying operating system; for example, they may always be case-sensitive on Linux.
Finally, the normalization behavior of some engines can even be controlled through flags, like in Redshift's case, where users can explicitly set enable_case_sensitive_identifier.
SQLGlot aims to understand and handle all of these different behaviors gracefully, so that it can analyze queries in the optimizer and successfully capture their semantics.
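A minimal sketch of calling this method directly on the BigQuery dialect (the identifier name is arbitrary):

from sqlglot import exp
from sqlglot.dialects.bigquery import BigQuery

bq = BigQuery()
ident = exp.to_identifier("MyCte")  # unquoted, unqualified identifier

# Per the heuristic above, a bare unquoted identifier (e.g. a CTE name) is treated as
# case-insensitive in BigQuery and is expected to be lowercased.
print(bq.normalize_identifier(ident).sql(dialect="bigquery"))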
Mapping of an escaped sequence (e.g. "\n") to its unescaped version (the actual newline character).
482 class Tokenizer(tokens.Tokenizer): 483 QUOTES = ["'", '"', '"""', "'''"] 484 COMMENTS = ["--", "#", ("/*", "*/")] 485 IDENTIFIERS = ["`"] 486 STRING_ESCAPES = ["\\"] 487 488 HEX_STRINGS = [("0x", ""), ("0X", "")] 489 490 BYTE_STRINGS = [ 491 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("b", "B") 492 ] 493 494 RAW_STRINGS = [ 495 (prefix + q, q) for q in t.cast(t.List[str], QUOTES) for prefix in ("r", "R") 496 ] 497 498 NESTED_COMMENTS = False 499 500 KEYWORDS = { 501 **tokens.Tokenizer.KEYWORDS, 502 "ANY TYPE": TokenType.VARIANT, 503 "BEGIN": TokenType.COMMAND, 504 "BEGIN TRANSACTION": TokenType.BEGIN, 505 "BYTEINT": TokenType.INT, 506 "BYTES": TokenType.BINARY, 507 "CURRENT_DATETIME": TokenType.CURRENT_DATETIME, 508 "DATETIME": TokenType.TIMESTAMP, 509 "DECLARE": TokenType.COMMAND, 510 "ELSEIF": TokenType.COMMAND, 511 "EXCEPTION": TokenType.COMMAND, 512 "EXPORT": TokenType.EXPORT, 513 "FLOAT64": TokenType.DOUBLE, 514 "FOR SYSTEM_TIME": TokenType.TIMESTAMP_SNAPSHOT, 515 "MODEL": TokenType.MODEL, 516 "NOT DETERMINISTIC": TokenType.VOLATILE, 517 "RECORD": TokenType.STRUCT, 518 "TIMESTAMP": TokenType.TIMESTAMPTZ, 519 } 520 KEYWORDS.pop("DIV") 521 KEYWORDS.pop("VALUES") 522 KEYWORDS.pop("/*+")
Inherited Members
- sqlglot.tokens.Tokenizer
- Tokenizer
- SINGLE_TOKENS
- BIT_STRINGS
- HEREDOC_STRINGS
- UNICODE_STRINGS
- VAR_SINGLE_TOKENS
- IDENTIFIER_ESCAPES
- HEREDOC_TAG_IS_IDENTIFIER
- HEREDOC_STRING_ALTERNATIVE
- STRING_ESCAPES_ALLOWED_IN_RAW_STRINGS
- HINT_START
- TOKENS_PRECEDING_HINT
- WHITE_SPACE
- COMMANDS
- COMMAND_PREFIX_TOKENS
- NUMERIC_LITERALS
- dialect
- use_rs_tokenizer
- reset
- tokenize
- tokenize_rs
- size
- sql
- tokens
524 class Parser(parser.Parser): 525 PREFIXED_PIVOT_COLUMNS = True 526 LOG_DEFAULTS_TO_LN = True 527 SUPPORTS_IMPLICIT_UNNEST = True 528 JOINS_HAVE_EQUAL_PRECEDENCE = True 529 530 # BigQuery does not allow ASC/DESC to be used as an identifier 531 ID_VAR_TOKENS = parser.Parser.ID_VAR_TOKENS - {TokenType.ASC, TokenType.DESC} 532 ALIAS_TOKENS = parser.Parser.ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 533 TABLE_ALIAS_TOKENS = parser.Parser.TABLE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 534 COMMENT_TABLE_ALIAS_TOKENS = parser.Parser.COMMENT_TABLE_ALIAS_TOKENS - { 535 TokenType.ASC, 536 TokenType.DESC, 537 } 538 UPDATE_ALIAS_TOKENS = parser.Parser.UPDATE_ALIAS_TOKENS - {TokenType.ASC, TokenType.DESC} 539 540 FUNCTIONS = { 541 **parser.Parser.FUNCTIONS, 542 "CONTAINS_SUBSTR": _build_contains_substring, 543 "DATE": _build_date, 544 "DATE_ADD": build_date_delta_with_interval(exp.DateAdd), 545 "DATE_SUB": build_date_delta_with_interval(exp.DateSub), 546 "DATE_TRUNC": lambda args: exp.DateTrunc( 547 unit=seq_get(args, 1), 548 this=seq_get(args, 0), 549 zone=seq_get(args, 2), 550 ), 551 "DATETIME": _build_datetime, 552 "DATETIME_ADD": build_date_delta_with_interval(exp.DatetimeAdd), 553 "DATETIME_SUB": build_date_delta_with_interval(exp.DatetimeSub), 554 "DIV": binary_from_function(exp.IntDiv), 555 "EDIT_DISTANCE": _build_levenshtein, 556 "FORMAT_DATE": _build_format_time(exp.TsOrDsToDate), 557 "GENERATE_ARRAY": exp.GenerateSeries.from_arg_list, 558 "JSON_EXTRACT_SCALAR": _build_extract_json_with_default_path(exp.JSONExtractScalar), 559 "JSON_EXTRACT_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 560 "JSON_QUERY": parser.build_extract_json_with_path(exp.JSONExtract), 561 "JSON_QUERY_ARRAY": _build_extract_json_with_default_path(exp.JSONExtractArray), 562 "JSON_VALUE": _build_extract_json_with_default_path(exp.JSONExtractScalar), 563 "JSON_VALUE_ARRAY": _build_extract_json_with_default_path(exp.JSONValueArray), 564 "LENGTH": lambda args: exp.Length(this=seq_get(args, 0), binary=True), 565 "MD5": exp.MD5Digest.from_arg_list, 566 "TO_HEX": _build_to_hex, 567 "PARSE_DATE": lambda args: build_formatted_time(exp.StrToDate, "bigquery")( 568 [seq_get(args, 1), seq_get(args, 0)] 569 ), 570 "PARSE_TIMESTAMP": _build_parse_timestamp, 571 "REGEXP_CONTAINS": exp.RegexpLike.from_arg_list, 572 "REGEXP_EXTRACT": _build_regexp_extract(exp.RegexpExtract), 573 "REGEXP_SUBSTR": _build_regexp_extract(exp.RegexpExtract), 574 "REGEXP_EXTRACT_ALL": _build_regexp_extract( 575 exp.RegexpExtractAll, default_group=exp.Literal.number(0) 576 ), 577 "SHA256": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(256)), 578 "SHA512": lambda args: exp.SHA2(this=seq_get(args, 0), length=exp.Literal.number(512)), 579 "SPLIT": lambda args: exp.Split( 580 # https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#split 581 this=seq_get(args, 0), 582 expression=seq_get(args, 1) or exp.Literal.string(","), 583 ), 584 "STRPOS": exp.StrPosition.from_arg_list, 585 "TIME": _build_time, 586 "TIME_ADD": build_date_delta_with_interval(exp.TimeAdd), 587 "TIME_SUB": build_date_delta_with_interval(exp.TimeSub), 588 "TIMESTAMP": _build_timestamp, 589 "TIMESTAMP_ADD": build_date_delta_with_interval(exp.TimestampAdd), 590 "TIMESTAMP_SUB": build_date_delta_with_interval(exp.TimestampSub), 591 "TIMESTAMP_MICROS": lambda args: exp.UnixToTime( 592 this=seq_get(args, 0), scale=exp.UnixToTime.MICROS 593 ), 594 "TIMESTAMP_MILLIS": lambda args: exp.UnixToTime( 595 this=seq_get(args, 0), 
scale=exp.UnixToTime.MILLIS 596 ), 597 "TIMESTAMP_SECONDS": lambda args: exp.UnixToTime(this=seq_get(args, 0)), 598 "TO_JSON_STRING": exp.JSONFormat.from_arg_list, 599 "FORMAT_DATETIME": _build_format_time(exp.TsOrDsToDatetime), 600 "FORMAT_TIMESTAMP": _build_format_time(exp.TsOrDsToTimestamp), 601 } 602 603 FUNCTION_PARSERS = { 604 **parser.Parser.FUNCTION_PARSERS, 605 "ARRAY": lambda self: self.expression(exp.Array, expressions=[self._parse_statement()]), 606 "MAKE_INTERVAL": lambda self: self._parse_make_interval(), 607 "FEATURES_AT_TIME": lambda self: self._parse_features_at_time(), 608 } 609 FUNCTION_PARSERS.pop("TRIM") 610 611 NO_PAREN_FUNCTIONS = { 612 **parser.Parser.NO_PAREN_FUNCTIONS, 613 TokenType.CURRENT_DATETIME: exp.CurrentDatetime, 614 } 615 616 NESTED_TYPE_TOKENS = { 617 *parser.Parser.NESTED_TYPE_TOKENS, 618 TokenType.TABLE, 619 } 620 621 PROPERTY_PARSERS = { 622 **parser.Parser.PROPERTY_PARSERS, 623 "NOT DETERMINISTIC": lambda self: self.expression( 624 exp.StabilityProperty, this=exp.Literal.string("VOLATILE") 625 ), 626 "OPTIONS": lambda self: self._parse_with_property(), 627 } 628 629 CONSTRAINT_PARSERS = { 630 **parser.Parser.CONSTRAINT_PARSERS, 631 "OPTIONS": lambda self: exp.Properties(expressions=self._parse_with_property()), 632 } 633 634 RANGE_PARSERS = parser.Parser.RANGE_PARSERS.copy() 635 RANGE_PARSERS.pop(TokenType.OVERLAPS) 636 637 NULL_TOKENS = {TokenType.NULL, TokenType.UNKNOWN} 638 639 DASHED_TABLE_PART_FOLLOW_TOKENS = {TokenType.DOT, TokenType.L_PAREN, TokenType.R_PAREN} 640 641 STATEMENT_PARSERS = { 642 **parser.Parser.STATEMENT_PARSERS, 643 TokenType.ELSE: lambda self: self._parse_as_command(self._prev), 644 TokenType.END: lambda self: self._parse_as_command(self._prev), 645 TokenType.FOR: lambda self: self._parse_for_in(), 646 TokenType.EXPORT: lambda self: self._parse_export_data(), 647 } 648 649 BRACKET_OFFSETS = { 650 "OFFSET": (0, False), 651 "ORDINAL": (1, False), 652 "SAFE_OFFSET": (0, True), 653 "SAFE_ORDINAL": (1, True), 654 } 655 656 def _parse_for_in(self) -> exp.ForIn: 657 this = self._parse_range() 658 self._match_text_seq("DO") 659 return self.expression(exp.ForIn, this=this, expression=self._parse_statement()) 660 661 def _parse_table_part(self, schema: bool = False) -> t.Optional[exp.Expression]: 662 this = super()._parse_table_part(schema=schema) or self._parse_number() 663 664 # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#table_names 665 if isinstance(this, exp.Identifier): 666 table_name = this.name 667 while self._match(TokenType.DASH, advance=False) and self._next: 668 start = self._curr 669 while self._is_connected() and not self._match_set( 670 self.DASHED_TABLE_PART_FOLLOW_TOKENS, advance=False 671 ): 672 self._advance() 673 674 if start == self._curr: 675 break 676 677 table_name += self._find_sql(start, self._prev) 678 679 this = exp.Identifier( 680 this=table_name, quoted=this.args.get("quoted") 681 ).update_positions(this) 682 elif isinstance(this, exp.Literal): 683 table_name = this.name 684 685 if self._is_connected() and self._parse_var(any_token=True): 686 table_name += self._prev.text 687 688 this = exp.Identifier(this=table_name, quoted=True).update_positions(this) 689 690 return this 691 692 def _parse_table_parts( 693 self, schema: bool = False, is_db_reference: bool = False, wildcard: bool = False 694 ) -> exp.Table: 695 table = super()._parse_table_parts( 696 schema=schema, is_db_reference=is_db_reference, wildcard=True 697 ) 698 699 # proj-1.db.tbl -- `1.` is tokenized as a float so we 
need to unravel it here 700 if not table.catalog: 701 if table.db: 702 previous_db = table.args["db"] 703 parts = table.db.split(".") 704 if len(parts) == 2 and not table.args["db"].quoted: 705 table.set( 706 "catalog", exp.Identifier(this=parts[0]).update_positions(previous_db) 707 ) 708 table.set("db", exp.Identifier(this=parts[1]).update_positions(previous_db)) 709 else: 710 previous_this = table.this 711 parts = table.name.split(".") 712 if len(parts) == 2 and not table.this.quoted: 713 table.set( 714 "db", exp.Identifier(this=parts[0]).update_positions(previous_this) 715 ) 716 table.set( 717 "this", exp.Identifier(this=parts[1]).update_positions(previous_this) 718 ) 719 720 if isinstance(table.this, exp.Identifier) and any("." in p.name for p in table.parts): 721 alias = table.this 722 catalog, db, this, *rest = ( 723 exp.to_identifier(p, quoted=True) 724 for p in split_num_words(".".join(p.name for p in table.parts), ".", 3) 725 ) 726 727 for part in (catalog, db, this): 728 if part: 729 part.update_positions(table.this) 730 731 if rest and this: 732 this = exp.Dot.build([this, *rest]) # type: ignore 733 734 table = exp.Table( 735 this=this, db=db, catalog=catalog, pivots=table.args.get("pivots") 736 ) 737 table.meta["quoted_table"] = True 738 else: 739 alias = None 740 741 # The `INFORMATION_SCHEMA` views in BigQuery need to be qualified by a region or 742 # dataset, so if the project identifier is omitted we need to fix the ast so that 743 # the `INFORMATION_SCHEMA.X` bit is represented as a single (quoted) Identifier. 744 # Otherwise, we wouldn't correctly qualify a `Table` node that references these 745 # views, because it would seem like the "catalog" part is set, when it'd actually 746 # be the region/dataset. Merging the two identifiers into a single one is done to 747 # avoid producing a 4-part Table reference, which would cause issues in the schema 748 # module, when there are 3-part table names mixed with information schema views. 749 # 750 # See: https://cloud.google.com/bigquery/docs/information-schema-intro#syntax 751 table_parts = table.parts 752 if len(table_parts) > 1 and table_parts[-2].name.upper() == "INFORMATION_SCHEMA": 753 # We need to alias the table here to avoid breaking existing qualified columns. 754 # This is expected to be safe, because if there's an actual alias coming up in 755 # the token stream, it will overwrite this one. If there isn't one, we are only 756 # exposing the name that can be used to reference the view explicitly (a no-op). 757 exp.alias_( 758 table, 759 t.cast(exp.Identifier, alias or table_parts[-1]), 760 table=True, 761 copy=False, 762 ) 763 764 info_schema_view = f"{table_parts[-2].name}.{table_parts[-1].name}" 765 new_this = exp.Identifier(this=info_schema_view, quoted=True).update_positions( 766 line=table_parts[-2].meta.get("line"), 767 col=table_parts[-1].meta.get("col"), 768 start=table_parts[-2].meta.get("start"), 769 end=table_parts[-1].meta.get("end"), 770 ) 771 table.set("this", new_this) 772 table.set("db", seq_get(table_parts, -3)) 773 table.set("catalog", seq_get(table_parts, -4)) 774 775 return table 776 777 def _parse_column(self) -> t.Optional[exp.Expression]: 778 column = super()._parse_column() 779 if isinstance(column, exp.Column): 780 parts = column.parts 781 if any("." 
in p.name for p in parts): 782 catalog, db, table, this, *rest = ( 783 exp.to_identifier(p, quoted=True) 784 for p in split_num_words(".".join(p.name for p in parts), ".", 4) 785 ) 786 787 if rest and this: 788 this = exp.Dot.build([this, *rest]) # type: ignore 789 790 column = exp.Column(this=this, table=table, db=db, catalog=catalog) 791 column.meta["quoted_column"] = True 792 793 return column 794 795 @t.overload 796 def _parse_json_object(self, agg: Lit[False]) -> exp.JSONObject: ... 797 798 @t.overload 799 def _parse_json_object(self, agg: Lit[True]) -> exp.JSONObjectAgg: ... 800 801 def _parse_json_object(self, agg=False): 802 json_object = super()._parse_json_object() 803 array_kv_pair = seq_get(json_object.expressions, 0) 804 805 # Converts BQ's "signature 2" of JSON_OBJECT into SQLGlot's canonical representation 806 # https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions#json_object_signature2 807 if ( 808 array_kv_pair 809 and isinstance(array_kv_pair.this, exp.Array) 810 and isinstance(array_kv_pair.expression, exp.Array) 811 ): 812 keys = array_kv_pair.this.expressions 813 values = array_kv_pair.expression.expressions 814 815 json_object.set( 816 "expressions", 817 [exp.JSONKeyValue(this=k, expression=v) for k, v in zip(keys, values)], 818 ) 819 820 return json_object 821 822 def _parse_bracket( 823 self, this: t.Optional[exp.Expression] = None 824 ) -> t.Optional[exp.Expression]: 825 bracket = super()._parse_bracket(this) 826 827 if this is bracket: 828 return bracket 829 830 if isinstance(bracket, exp.Bracket): 831 for expression in bracket.expressions: 832 name = expression.name.upper() 833 834 if name not in self.BRACKET_OFFSETS: 835 break 836 837 offset, safe = self.BRACKET_OFFSETS[name] 838 bracket.set("offset", offset) 839 bracket.set("safe", safe) 840 expression.replace(expression.expressions[0]) 841 842 return bracket 843 844 def _parse_unnest(self, with_alias: bool = True) -> t.Optional[exp.Unnest]: 845 unnest = super()._parse_unnest(with_alias=with_alias) 846 847 if not unnest: 848 return None 849 850 unnest_expr = seq_get(unnest.expressions, 0) 851 if unnest_expr: 852 from sqlglot.optimizer.annotate_types import annotate_types 853 854 unnest_expr = annotate_types(unnest_expr, dialect=self.dialect) 855 856 # Unnesting a nested array (i.e array of structs) explodes the top-level struct fields, 857 # in contrast to other dialects such as DuckDB which flattens only the array by default 858 if unnest_expr.is_type(exp.DataType.Type.ARRAY) and any( 859 array_elem.is_type(exp.DataType.Type.STRUCT) 860 for array_elem in unnest_expr._type.expressions 861 ): 862 unnest.set("explode_array", True) 863 864 return unnest 865 866 def _parse_make_interval(self) -> exp.MakeInterval: 867 expr = exp.MakeInterval() 868 869 for arg_key in expr.arg_types: 870 value = self._parse_lambda() 871 872 if not value: 873 break 874 875 # Non-named arguments are filled sequentially, (optionally) followed by named arguments 876 # that can appear in any order e.g MAKE_INTERVAL(1, minute => 5, day => 2) 877 if isinstance(value, exp.Kwarg): 878 arg_key = value.this.name 879 880 expr.set(arg_key, value) 881 882 self._match(TokenType.COMMA) 883 884 return expr 885 886 def _parse_features_at_time(self) -> exp.FeaturesAtTime: 887 expr = self.expression( 888 exp.FeaturesAtTime, 889 this=(self._match(TokenType.TABLE) and self._parse_table()) 890 or self._parse_select(nested=True), 891 ) 892 893 while self._match(TokenType.COMMA): 894 arg = self._parse_lambda() 895 896 # Get the LHS of 
the Kwarg and set the arg to that value, e.g 897 # "num_rows => 1" sets the expr's `num_rows` arg 898 if arg: 899 expr.set(arg.this.name, arg) 900 901 return expr 902 903 def _parse_export_data(self) -> exp.Export: 904 self._match_text_seq("DATA") 905 906 return self.expression( 907 exp.Export, 908 connection=self._match_text_seq("WITH", "CONNECTION") and self._parse_table_parts(), 909 options=self._parse_properties(), 910 this=self._match_text_seq("AS") and self._parse_select(), 911 )
Parser consumes a list of tokens produced by the Tokenizer and produces a parsed syntax tree.
Arguments:
- error_level: The desired error level. Default: ErrorLevel.IMMEDIATE
- error_message_context: The amount of context to capture from a query string when displaying the error message (in number of characters). Default: 100
- max_errors: Maximum number of error messages to include in a raised ParseError. This is only relevant if error_level is ErrorLevel.RAISE. Default: 3
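In practice this parser is usually reached through the top-level API; a short, hedged usage sketch:

import sqlglot
from sqlglot import exp

# read="bigquery" selects BigQuery.Tokenizer and BigQuery.Parser under the hood.
ast = sqlglot.parse_one("SELECT * FROM `my-project.dataset.tbl`", read="bigquery")
print(repr(ast.find(exp.Table)))  # inspect how the quoted path was split into table parts
print(ast.sql(dialect="bigquery"))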
Inherited Members
- sqlglot.parser.Parser
- Parser
- STRUCT_TYPE_TOKENS
- ENUM_TYPE_TOKENS
- AGGREGATE_TYPE_TOKENS
- TYPE_TOKENS
- SIGNED_TO_UNSIGNED_TYPE_TOKEN
- SUBQUERY_PREDICATES
- RESERVED_TOKENS
- DB_CREATABLES
- CREATABLES
- ALTERABLES
- COLON_PLACEHOLDER_TOKENS
- ARRAY_CONSTRUCTORS
- TRIM_TYPES
- FUNC_TOKENS
- CONJUNCTION
- ASSIGNMENT
- DISJUNCTION
- EQUALITY
- COMPARISON
- BITWISE
- TERM
- FACTOR
- EXPONENT
- TIMES
- TIMESTAMPS
- SET_OPERATIONS
- JOIN_METHODS
- JOIN_SIDES
- JOIN_KINDS
- JOIN_HINTS
- LAMBDAS
- COLUMN_OPERATORS
- EXPRESSION_PARSERS
- UNARY_PARSERS
- STRING_PARSERS
- NUMERIC_PARSERS
- PRIMARY_PARSERS
- PLACEHOLDER_PARSERS
- PIPE_SYNTAX_TRANSFORM_PARSERS
- ALTER_PARSERS
- ALTER_ALTER_PARSERS
- SCHEMA_UNNAMED_CONSTRAINTS
- NO_PAREN_FUNCTION_PARSERS
- INVALID_FUNC_NAME_TOKENS
- FUNCTIONS_WITH_ALIASED_ARGS
- KEY_VALUE_DEFINITIONS
- QUERY_MODIFIER_PARSERS
- SET_PARSERS
- SHOW_PARSERS
- TYPE_LITERAL_PARSERS
- TYPE_CONVERTERS
- DDL_SELECT_TOKENS
- PRE_VOLATILE_TOKENS
- TRANSACTION_KIND
- TRANSACTION_CHARACTERISTICS
- CONFLICT_ACTIONS
- CREATE_SEQUENCE
- ISOLATED_LOADING_OPTIONS
- USABLES
- CAST_ACTIONS
- SCHEMA_BINDING_OPTIONS
- PROCEDURE_OPTIONS
- EXECUTE_AS_OPTIONS
- KEY_CONSTRAINT_OPTIONS
- WINDOW_EXCLUDE_OPTIONS
- INSERT_ALTERNATIVES
- CLONE_KEYWORDS
- HISTORICAL_DATA_PREFIX
- HISTORICAL_DATA_KIND
- OPCLASS_FOLLOW_KEYWORDS
- OPTYPE_FOLLOW_TOKENS
- TABLE_INDEX_HINT_TOKENS
- VIEW_ATTRIBUTES
- WINDOW_ALIAS_TOKENS
- WINDOW_BEFORE_PAREN_TOKENS
- WINDOW_SIDES
- JSON_KEY_VALUE_SEPARATOR_TOKENS
- FETCH_TOKENS
- ADD_CONSTRAINT_TOKENS
- DISTINCT_TOKENS
- UNNEST_OFFSET_ALIAS_TOKENS
- SELECT_START_TOKENS
- COPY_INTO_VARLEN_OPTIONS
- IS_JSON_PREDICATE_KIND
- ODBC_DATETIME_LITERALS
- ON_CONDITION_TOKENS
- PRIVILEGE_FOLLOW_TOKENS
- DESCRIBE_STYLES
- ANALYZE_STYLES
- ANALYZE_EXPRESSION_PARSERS
- PARTITION_KEYWORDS
- AMBIGUOUS_ALIAS_TOKENS
- OPERATION_MODIFIERS
- RECURSIVE_CTE_SEARCH_KIND
- MODIFIABLES
- STRICT_CAST
- IDENTIFY_PIVOT_STRINGS
- TABLESAMPLE_CSV
- DEFAULT_SAMPLING_METHOD
- SET_REQUIRES_ASSIGNMENT_DELIMITER
- TRIM_PATTERN_FIRST
- STRING_ALIASES
- MODIFIERS_ATTACHED_TO_SET_OP
- SET_OP_MODIFIERS
- NO_PAREN_IF_COMMANDS
- JSON_ARROWS_REQUIRE_JSON_TYPE
- COLON_IS_VARIANT_EXTRACT
- VALUES_FOLLOWED_BY_PAREN
- INTERVAL_SPANS
- SUPPORTS_PARTITION_SELECTION
- WRAPPED_TRANSFORM_COLUMN_CONSTRAINT
- OPTIONAL_ALIAS_TOKEN_CTE
- ALTER_RENAME_REQUIRES_COLUMN
- ZONE_AWARE_TIMESTAMP_CONSTRUCTOR
- error_level
- error_message_context
- max_errors
- dialect
- reset
- parse
- parse_into
- check_errors
- raise_error
- expression
- validate_expression
- parse_set_operation
- errors
- sql
913 class Generator(generator.Generator): 914 INTERVAL_ALLOWS_PLURAL_FORM = False 915 JOIN_HINTS = False 916 QUERY_HINTS = False 917 TABLE_HINTS = False 918 LIMIT_FETCH = "LIMIT" 919 RENAME_TABLE_WITH_DB = False 920 NVL2_SUPPORTED = False 921 UNNEST_WITH_ORDINALITY = False 922 COLLATE_IS_FUNC = True 923 LIMIT_ONLY_LITERALS = True 924 SUPPORTS_TABLE_ALIAS_COLUMNS = False 925 UNPIVOT_ALIASES_ARE_IDENTIFIERS = False 926 JSON_KEY_VALUE_PAIR_SEP = "," 927 NULL_ORDERING_SUPPORTED = False 928 IGNORE_NULLS_IN_FUNC = True 929 JSON_PATH_SINGLE_QUOTE_ESCAPE = True 930 CAN_IMPLEMENT_ARRAY_ANY = True 931 SUPPORTS_TO_NUMBER = False 932 NAMED_PLACEHOLDER_TOKEN = "@" 933 HEX_FUNC = "TO_HEX" 934 WITH_PROPERTIES_PREFIX = "OPTIONS" 935 SUPPORTS_EXPLODING_PROJECTIONS = False 936 EXCEPT_INTERSECT_SUPPORT_ALL_CLAUSE = False 937 SUPPORTS_UNIX_SECONDS = True 938 939 TRANSFORMS = { 940 **generator.Generator.TRANSFORMS, 941 exp.ApproxDistinct: rename_func("APPROX_COUNT_DISTINCT"), 942 exp.ArgMax: arg_max_or_min_no_count("MAX_BY"), 943 exp.ArgMin: arg_max_or_min_no_count("MIN_BY"), 944 exp.Array: inline_array_unless_query, 945 exp.ArrayContains: _array_contains_sql, 946 exp.ArrayFilter: filter_array_using_unnest, 947 exp.ArrayRemove: filter_array_using_unnest, 948 exp.Cast: transforms.preprocess([transforms.remove_precision_parameterized_types]), 949 exp.CollateProperty: lambda self, e: ( 950 f"DEFAULT COLLATE {self.sql(e, 'this')}" 951 if e.args.get("default") 952 else f"COLLATE {self.sql(e, 'this')}" 953 ), 954 exp.Commit: lambda *_: "COMMIT TRANSACTION", 955 exp.CountIf: rename_func("COUNTIF"), 956 exp.Create: _create_sql, 957 exp.CTE: transforms.preprocess([_pushdown_cte_column_names]), 958 exp.DateAdd: date_add_interval_sql("DATE", "ADD"), 959 exp.DateDiff: lambda self, e: self.func( 960 "DATE_DIFF", e.this, e.expression, unit_to_var(e) 961 ), 962 exp.DateFromParts: rename_func("DATE"), 963 exp.DateStrToDate: datestrtodate_sql, 964 exp.DateSub: date_add_interval_sql("DATE", "SUB"), 965 exp.DatetimeAdd: date_add_interval_sql("DATETIME", "ADD"), 966 exp.DatetimeSub: date_add_interval_sql("DATETIME", "SUB"), 967 exp.FromTimeZone: lambda self, e: self.func( 968 "DATETIME", self.func("TIMESTAMP", e.this, e.args.get("zone")), "'UTC'" 969 ), 970 exp.GenerateSeries: rename_func("GENERATE_ARRAY"), 971 exp.GroupConcat: lambda self, e: groupconcat_sql( 972 self, e, func_name="STRING_AGG", within_group=False 973 ), 974 exp.Hex: lambda self, e: self.func("UPPER", self.func("TO_HEX", self.sql(e, "this"))), 975 exp.HexString: lambda self, e: self.hexstring_sql(e, binary_function_repr="FROM_HEX"), 976 exp.If: if_sql(false_value="NULL"), 977 exp.ILike: no_ilike_sql, 978 exp.IntDiv: rename_func("DIV"), 979 exp.Int64: rename_func("INT64"), 980 exp.JSONExtract: _json_extract_sql, 981 exp.JSONExtractArray: _json_extract_sql, 982 exp.JSONExtractScalar: _json_extract_sql, 983 exp.JSONFormat: rename_func("TO_JSON_STRING"), 984 exp.Levenshtein: _levenshtein_sql, 985 exp.Max: max_or_greatest, 986 exp.MD5: lambda self, e: self.func("TO_HEX", self.func("MD5", e.this)), 987 exp.MD5Digest: rename_func("MD5"), 988 exp.Min: min_or_least, 989 exp.PartitionedByProperty: lambda self, e: f"PARTITION BY {self.sql(e, 'this')}", 990 exp.RegexpExtract: lambda self, e: self.func( 991 "REGEXP_EXTRACT", 992 e.this, 993 e.expression, 994 e.args.get("position"), 995 e.args.get("occurrence"), 996 ), 997 exp.RegexpExtractAll: lambda self, e: self.func( 998 "REGEXP_EXTRACT_ALL", e.this, e.expression 999 ), 1000 exp.RegexpReplace: regexp_replace_sql, 1001 
exp.RegexpLike: rename_func("REGEXP_CONTAINS"), 1002 exp.ReturnsProperty: _returnsproperty_sql, 1003 exp.Rollback: lambda *_: "ROLLBACK TRANSACTION", 1004 exp.Select: transforms.preprocess( 1005 [ 1006 transforms.explode_projection_to_unnest(), 1007 transforms.unqualify_unnest, 1008 transforms.eliminate_distinct_on, 1009 _alias_ordered_group, 1010 transforms.eliminate_semi_and_anti_joins, 1011 ] 1012 ), 1013 exp.SHA: rename_func("SHA1"), 1014 exp.SHA2: sha256_sql, 1015 exp.StabilityProperty: lambda self, e: ( 1016 "DETERMINISTIC" if e.name == "IMMUTABLE" else "NOT DETERMINISTIC" 1017 ), 1018 exp.String: rename_func("STRING"), 1019 exp.StrPosition: lambda self, e: ( 1020 strposition_sql( 1021 self, e, func_name="INSTR", supports_position=True, supports_occurrence=True 1022 ) 1023 ), 1024 exp.StrToDate: _str_to_datetime_sql, 1025 exp.StrToTime: _str_to_datetime_sql, 1026 exp.TimeAdd: date_add_interval_sql("TIME", "ADD"), 1027 exp.TimeFromParts: rename_func("TIME"), 1028 exp.TimestampFromParts: rename_func("DATETIME"), 1029 exp.TimeSub: date_add_interval_sql("TIME", "SUB"), 1030 exp.TimestampAdd: date_add_interval_sql("TIMESTAMP", "ADD"), 1031 exp.TimestampDiff: rename_func("TIMESTAMP_DIFF"), 1032 exp.TimestampSub: date_add_interval_sql("TIMESTAMP", "SUB"), 1033 exp.TimeStrToTime: timestrtotime_sql, 1034 exp.Transaction: lambda *_: "BEGIN TRANSACTION", 1035 exp.TsOrDsAdd: _ts_or_ds_add_sql, 1036 exp.TsOrDsDiff: _ts_or_ds_diff_sql, 1037 exp.TsOrDsToTime: rename_func("TIME"), 1038 exp.TsOrDsToDatetime: rename_func("DATETIME"), 1039 exp.TsOrDsToTimestamp: rename_func("TIMESTAMP"), 1040 exp.Unhex: rename_func("FROM_HEX"), 1041 exp.UnixDate: rename_func("UNIX_DATE"), 1042 exp.UnixToTime: _unix_to_time_sql, 1043 exp.Uuid: lambda *_: "GENERATE_UUID()", 1044 exp.Values: _derived_table_values_to_unnest, 1045 exp.VariancePop: rename_func("VAR_POP"), 1046 exp.SafeDivide: rename_func("SAFE_DIVIDE"), 1047 } 1048 1049 SUPPORTED_JSON_PATH_PARTS = { 1050 exp.JSONPathKey, 1051 exp.JSONPathRoot, 1052 exp.JSONPathSubscript, 1053 } 1054 1055 TYPE_MAPPING = { 1056 **generator.Generator.TYPE_MAPPING, 1057 exp.DataType.Type.BIGDECIMAL: "BIGNUMERIC", 1058 exp.DataType.Type.BIGINT: "INT64", 1059 exp.DataType.Type.BINARY: "BYTES", 1060 exp.DataType.Type.BLOB: "BYTES", 1061 exp.DataType.Type.BOOLEAN: "BOOL", 1062 exp.DataType.Type.CHAR: "STRING", 1063 exp.DataType.Type.DECIMAL: "NUMERIC", 1064 exp.DataType.Type.DOUBLE: "FLOAT64", 1065 exp.DataType.Type.FLOAT: "FLOAT64", 1066 exp.DataType.Type.INT: "INT64", 1067 exp.DataType.Type.NCHAR: "STRING", 1068 exp.DataType.Type.NVARCHAR: "STRING", 1069 exp.DataType.Type.SMALLINT: "INT64", 1070 exp.DataType.Type.TEXT: "STRING", 1071 exp.DataType.Type.TIMESTAMP: "DATETIME", 1072 exp.DataType.Type.TIMESTAMPNTZ: "DATETIME", 1073 exp.DataType.Type.TIMESTAMPTZ: "TIMESTAMP", 1074 exp.DataType.Type.TIMESTAMPLTZ: "TIMESTAMP", 1075 exp.DataType.Type.TINYINT: "INT64", 1076 exp.DataType.Type.ROWVERSION: "BYTES", 1077 exp.DataType.Type.UUID: "STRING", 1078 exp.DataType.Type.VARBINARY: "BYTES", 1079 exp.DataType.Type.VARCHAR: "STRING", 1080 exp.DataType.Type.VARIANT: "ANY TYPE", 1081 } 1082 1083 PROPERTIES_LOCATION = { 1084 **generator.Generator.PROPERTIES_LOCATION, 1085 exp.PartitionedByProperty: exp.Properties.Location.POST_SCHEMA, 1086 exp.VolatileProperty: exp.Properties.Location.UNSUPPORTED, 1087 } 1088 1089 # WINDOW comes after QUALIFY 1090 # https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#window_clause 1091 AFTER_HAVING_MODIFIER_TRANSFORMS = { 1092 "qualify": 
generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["qualify"], 1093 "windows": generator.Generator.AFTER_HAVING_MODIFIER_TRANSFORMS["windows"], 1094 } 1095 1096 # from: https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical#reserved_keywords 1097 RESERVED_KEYWORDS = { 1098 "all", 1099 "and", 1100 "any", 1101 "array", 1102 "as", 1103 "asc", 1104 "assert_rows_modified", 1105 "at", 1106 "between", 1107 "by", 1108 "case", 1109 "cast", 1110 "collate", 1111 "contains", 1112 "create", 1113 "cross", 1114 "cube", 1115 "current", 1116 "default", 1117 "define", 1118 "desc", 1119 "distinct", 1120 "else", 1121 "end", 1122 "enum", 1123 "escape", 1124 "except", 1125 "exclude", 1126 "exists", 1127 "extract", 1128 "false", 1129 "fetch", 1130 "following", 1131 "for", 1132 "from", 1133 "full", 1134 "group", 1135 "grouping", 1136 "groups", 1137 "hash", 1138 "having", 1139 "if", 1140 "ignore", 1141 "in", 1142 "inner", 1143 "intersect", 1144 "interval", 1145 "into", 1146 "is", 1147 "join", 1148 "lateral", 1149 "left", 1150 "like", 1151 "limit", 1152 "lookup", 1153 "merge", 1154 "natural", 1155 "new", 1156 "no", 1157 "not", 1158 "null", 1159 "nulls", 1160 "of", 1161 "on", 1162 "or", 1163 "order", 1164 "outer", 1165 "over", 1166 "partition", 1167 "preceding", 1168 "proto", 1169 "qualify", 1170 "range", 1171 "recursive", 1172 "respect", 1173 "right", 1174 "rollup", 1175 "rows", 1176 "select", 1177 "set", 1178 "some", 1179 "struct", 1180 "tablesample", 1181 "then", 1182 "to", 1183 "treat", 1184 "true", 1185 "unbounded", 1186 "union", 1187 "unnest", 1188 "using", 1189 "when", 1190 "where", 1191 "window", 1192 "with", 1193 "within", 1194 } 1195 1196 def datetrunc_sql(self, expression: exp.DateTrunc) -> str: 1197 unit = expression.unit 1198 unit_sql = unit.name if unit.is_string else self.sql(unit) 1199 return self.func("DATE_TRUNC", expression.this, unit_sql, expression.args.get("zone")) 1200 1201 def mod_sql(self, expression: exp.Mod) -> str: 1202 this = expression.this 1203 expr = expression.expression 1204 return self.func( 1205 "MOD", 1206 this.unnest() if isinstance(this, exp.Paren) else this, 1207 expr.unnest() if isinstance(expr, exp.Paren) else expr, 1208 ) 1209 1210 def column_parts(self, expression: exp.Column) -> str: 1211 if expression.meta.get("quoted_column"): 1212 # If a column reference is of the form `dataset.table`.name, we need 1213 # to preserve the quoted table path, otherwise the reference breaks 1214 table_parts = ".".join(p.name for p in expression.parts[:-1]) 1215 table_path = self.sql(exp.Identifier(this=table_parts, quoted=True)) 1216 return f"{table_path}.{self.sql(expression, 'this')}" 1217 1218 return super().column_parts(expression) 1219 1220 def table_parts(self, expression: exp.Table) -> str: 1221 # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so 1222 # we need to make sure the correct quoting is used in each case. 
1223 # 1224 # For example, if there is a CTE x that clashes with a schema name, then the former will 1225 # return the table y in that schema, whereas the latter will return the CTE's y column: 1226 # 1227 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y` -> cross join 1228 # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest 1229 if expression.meta.get("quoted_table"): 1230 table_parts = ".".join(p.name for p in expression.parts) 1231 return self.sql(exp.Identifier(this=table_parts, quoted=True)) 1232 1233 return super().table_parts(expression) 1234 1235 def timetostr_sql(self, expression: exp.TimeToStr) -> str: 1236 this = expression.this 1237 if isinstance(this, exp.TsOrDsToDatetime): 1238 func_name = "FORMAT_DATETIME" 1239 elif isinstance(this, exp.TsOrDsToTimestamp): 1240 func_name = "FORMAT_TIMESTAMP" 1241 else: 1242 func_name = "FORMAT_DATE" 1243 1244 time_expr = ( 1245 this 1246 if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate)) 1247 else expression 1248 ) 1249 return self.func( 1250 func_name, self.format_time(expression), time_expr.this, expression.args.get("zone") 1251 ) 1252 1253 def eq_sql(self, expression: exp.EQ) -> str: 1254 # Operands of = cannot be NULL in BigQuery 1255 if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null): 1256 if not isinstance(expression.parent, exp.Update): 1257 return "NULL" 1258 1259 return self.binary(expression, "=") 1260 1261 def attimezone_sql(self, expression: exp.AtTimeZone) -> str: 1262 parent = expression.parent 1263 1264 # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]). 1265 # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included. 1266 if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"): 1267 return self.func( 1268 "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone")) 1269 ) 1270 1271 return super().attimezone_sql(expression) 1272 1273 def trycast_sql(self, expression: exp.TryCast) -> str: 1274 return self.cast_sql(expression, safe_prefix="SAFE_") 1275 1276 def bracket_sql(self, expression: exp.Bracket) -> str: 1277 this = expression.this 1278 expressions = expression.expressions 1279 1280 if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT): 1281 arg = expressions[0] 1282 if arg.type is None: 1283 from sqlglot.optimizer.annotate_types import annotate_types 1284 1285 arg = annotate_types(arg, dialect=self.dialect) 1286 1287 if arg.type and arg.type.this in exp.DataType.TEXT_TYPES: 1288 # BQ doesn't support bracket syntax with string values for structs 1289 return f"{self.sql(this)}.{arg.name}" 1290 1291 expressions_sql = self.expressions(expression, flat=True) 1292 offset = expression.args.get("offset") 1293 1294 if offset == 0: 1295 expressions_sql = f"OFFSET({expressions_sql})" 1296 elif offset == 1: 1297 expressions_sql = f"ORDINAL({expressions_sql})" 1298 elif offset is not None: 1299 self.unsupported(f"Unsupported array offset: {offset}") 1300 1301 if expression.args.get("safe"): 1302 expressions_sql = f"SAFE_{expressions_sql}" 1303 1304 return f"{self.sql(this)}[{expressions_sql}]" 1305 1306 def in_unnest_op(self, expression: exp.Unnest) -> str: 1307 return self.sql(expression) 1308 1309 def version_sql(self, expression: exp.Version) -> str: 1310 if expression.name == "TIMESTAMP": 1311 expression.set("this", "SYSTEM_TIME") 1312 return super().version_sql(expression) 1313 1314 def contains_sql(self, 
expression: exp.Contains) -> str: 1315 this = expression.this 1316 expr = expression.expression 1317 1318 if isinstance(this, exp.Lower) and isinstance(expr, exp.Lower): 1319 this = this.this 1320 expr = expr.this 1321 1322 return self.func("CONTAINS_SUBSTR", this, expr) 1323 1324 def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str: 1325 this = expression.this 1326 1327 # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3] 1328 # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions, 1329 # because they aren't literals and so the above syntax is invalid BigQuery. 1330 if isinstance(this, exp.Array): 1331 elem = seq_get(this.expressions, 0) 1332 if not (elem and elem.find(exp.Query)): 1333 return f"{self.sql(expression, 'to')}{self.sql(this)}" 1334 1335 return super().cast_sql(expression, safe_prefix=safe_prefix)
Generator converts a given syntax tree to the corresponding SQL string.
Arguments:
- pretty: Whether to format the produced SQL string. Default: False.
- identify: Determines when an identifier should be quoted. Possible values are: False (default): Never quote, except in cases where it's mandatory by the dialect. True or 'always': Always quote. 'safe': Only quote identifiers that are case insensitive.
- normalize: Whether to normalize identifiers to lowercase. Default: False.
- pad: The pad size in a formatted string. For example, this affects the indentation of a projection in a query, relative to its nesting level. Default: 2.
- indent: The indentation size in a formatted string. For example, this affects the indentation of subqueries and filters under a WHERE clause. Default: 2.
- normalize_functions: How to normalize function names. Possible values are: "upper" or True (default): Convert names to uppercase. "lower": Convert names to lowercase. False: Disables function name normalization.
- unsupported_level: Determines the generator's behavior when it encounters unsupported expressions. Default ErrorLevel.WARN.
- max_unsupported: Maximum number of unsupported messages to include in a raised UnsupportedError. This is only relevant if unsupported_level is ErrorLevel.RAISE. Default: 3
- leading_comma: Whether the comma is leading or trailing in select expressions. This is only relevant when generating in pretty mode. Default: False
- max_text_width: The max number of characters in a segment before creating new lines in pretty mode. The default is on the smaller end because the length only represents a segment and not the true line length. Default: 80
- comments: Whether to preserve comments in the output SQL code. Default: True
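These options are inherited unchanged from the base generator, so for BigQuery they are normally supplied through the public transpile/generate helpers rather than by instantiating the generator directly. A minimal sketch, assuming made-up dataset and column names; the rendered SQL described in comments is indicative and may differ slightly between sqlglot versions:

import sqlglot

sql = "select col_a, sum(col_b) as total from my_dataset.my_table group by col_a"

# Generator options such as pretty, identify and normalize_functions are simply
# forwarded as keyword arguments through transpile().
print(
    sqlglot.transpile(
        sql,
        read="bigquery",
        write="bigquery",
        pretty=True,                  # multi-line, indented output
        identify=True,                # quote every identifier, e.g. `my_dataset`.`my_table`
        normalize_functions="lower",  # render sum(...) instead of SUM(...)
    )[0]
)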
def column_parts(self, expression: exp.Column) -> str:
    if expression.meta.get("quoted_column"):
        # If a column reference is of the form `dataset.table`.name, we need
        # to preserve the quoted table path, otherwise the reference breaks
        table_parts = ".".join(p.name for p in expression.parts[:-1])
        table_path = self.sql(exp.Identifier(this=table_parts, quoted=True))
        return f"{table_path}.{self.sql(expression, 'this')}"

    return super().column_parts(expression)
def table_parts(self, expression: exp.Table) -> str:
    # Depending on the context, `x.y` may not resolve to the same data source as `x`.`y`, so
    # we need to make sure the correct quoting is used in each case.
    #
    # For example, if there is a CTE x that clashes with a schema name, then the former will
    # return the table y in that schema, whereas the latter will return the CTE's y column:
    #
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x.y`   -> cross join
    # - WITH x AS (SELECT [1, 2] AS y) SELECT * FROM x, `x`.`y` -> implicit unnest
    if expression.meta.get("quoted_table"):
        table_parts = ".".join(p.name for p in expression.parts)
        return self.sql(exp.Identifier(this=table_parts, quoted=True))

    return super().table_parts(expression)
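A hedged sketch of the distinction described above, using a hypothetical project/dataset/table; both forms are expected to round-trip with their original quoting preserved:

import sqlglot

# One backquoted path stays a single quoted identifier on output...
print(sqlglot.transpile("SELECT * FROM `my-project.dataset.tbl`", read="bigquery")[0])
# ...while separately quoted parts remain separate, since they can resolve to
# different objects (e.g. a CTE column versus a table in a schema).
print(sqlglot.transpile("SELECT * FROM `my-project`.`dataset`.`tbl`", read="bigquery")[0])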
def timetostr_sql(self, expression: exp.TimeToStr) -> str:
    this = expression.this
    if isinstance(this, exp.TsOrDsToDatetime):
        func_name = "FORMAT_DATETIME"
    elif isinstance(this, exp.TsOrDsToTimestamp):
        func_name = "FORMAT_TIMESTAMP"
    else:
        func_name = "FORMAT_DATE"

    time_expr = (
        this
        if isinstance(this, (exp.TsOrDsToDatetime, exp.TsOrDsToTimestamp, exp.TsOrDsToDate))
        else expression
    )
    return self.func(
        func_name, self.format_time(expression), time_expr.this, expression.args.get("zone")
    )
def eq_sql(self, expression: exp.EQ) -> str:
    # Operands of = cannot be NULL in BigQuery
    if isinstance(expression.left, exp.Null) or isinstance(expression.right, exp.Null):
        if not isinstance(expression.parent, exp.Update):
            return "NULL"

    return self.binary(expression, "=")
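A hedged sketch of the effect; the output shown in the comment is indicative:

import sqlglot

# `=` against NULL never matches in BigQuery, so the comparison is folded to NULL
# (UPDATE ... SET col = NULL is left untouched because there `=` is an assignment).
print(sqlglot.transpile("SELECT * FROM t WHERE x = NULL", write="bigquery")[0])
# e.g. SELECT * FROM t WHERE NULL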
def attimezone_sql(self, expression: exp.AtTimeZone) -> str:
    parent = expression.parent

    # BigQuery allows CAST(.. AS {STRING|TIMESTAMP} [FORMAT <fmt> [AT TIME ZONE <tz>]]).
    # Only the TIMESTAMP one should use the below conversion, when AT TIME ZONE is included.
    if not isinstance(parent, exp.Cast) or not parent.to.is_type("text"):
        return self.func(
            "TIMESTAMP", self.func("DATETIME", expression.this, expression.args.get("zone"))
        )

    return super().attimezone_sql(expression)
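For example, an AT TIME ZONE expression coming from another dialect is expected to be rewritten into the nested DATETIME/TIMESTAMP form (a hedged sketch with a hypothetical ts column; exact output may vary by version):

import sqlglot

print(
    sqlglot.transpile(
        "SELECT ts AT TIME ZONE 'America/New_York'", read="postgres", write="bigquery"
    )[0]
)
# e.g. SELECT TIMESTAMP(DATETIME(ts, 'America/New_York'))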
def bracket_sql(self, expression: exp.Bracket) -> str:
    this = expression.this
    expressions = expression.expressions

    if len(expressions) == 1 and this and this.is_type(exp.DataType.Type.STRUCT):
        arg = expressions[0]
        if arg.type is None:
            from sqlglot.optimizer.annotate_types import annotate_types

            arg = annotate_types(arg, dialect=self.dialect)

        if arg.type and arg.type.this in exp.DataType.TEXT_TYPES:
            # BQ doesn't support bracket syntax with string values for structs
            return f"{self.sql(this)}.{arg.name}"

    expressions_sql = self.expressions(expression, flat=True)
    offset = expression.args.get("offset")

    if offset == 0:
        expressions_sql = f"OFFSET({expressions_sql})"
    elif offset == 1:
        expressions_sql = f"ORDINAL({expressions_sql})"
    elif offset is not None:
        self.unsupported(f"Unsupported array offset: {offset}")

    if expression.args.get("safe"):
        expressions_sql = f"SAFE_{expressions_sql}"

    return f"{self.sql(this)}[{expressions_sql}]"
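The OFFSET/ORDINAL/SAFE_ variants can be exercised by round-tripping BigQuery SQL; a minimal sketch with a hypothetical arr column, where each statement is expected to come back unchanged:

import sqlglot

for query in (
    "SELECT arr[OFFSET(0)] FROM t",       # 0-based lookup
    "SELECT arr[ORDINAL(1)] FROM t",      # 1-based lookup
    "SELECT arr[SAFE_OFFSET(0)] FROM t",  # NULL instead of an error when out of range
):
    print(sqlglot.transpile(query, read="bigquery", write="bigquery")[0])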
def cast_sql(self, expression: exp.Cast, safe_prefix: t.Optional[str] = None) -> str:
    this = expression.this

    # This ensures that inline type-annotated ARRAY literals like ARRAY<INT64>[1, 2, 3]
    # are roundtripped unaffected. The inner check excludes ARRAY(SELECT ...) expressions,
    # because they aren't literals and so the above syntax is invalid BigQuery.
    if isinstance(this, exp.Array):
        elem = seq_get(this.expressions, 0)
        if not (elem and elem.find(exp.Query)):
            return f"{self.sql(expression, 'to')}{self.sql(this)}"

    return super().cast_sql(expression, safe_prefix=safe_prefix)
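A hedged round-trip sketch of the typed array literal case the comment refers to:

import sqlglot

# The inline type annotation is preserved rather than being rewritten as a CAST.
print(sqlglot.transpile("SELECT ARRAY<INT64>[1, 2, 3]", read="bigquery", write="bigquery")[0])
# e.g. SELECT ARRAY<INT64>[1, 2, 3]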
Inherited Members
- sqlglot.generator.Generator
- Generator
- LOCKING_READS_SUPPORTED
- WRAP_DERIVED_VALUES
- CREATE_FUNCTION_RETURN_AS
- MATCHED_BY_SOURCE
- SINGLE_STRING_INTERVAL
- GROUPINGS_SEP
- INDEX_ON
- QUERY_HINT_SEP
- IS_BOOL_ALLOWED
- DUPLICATE_KEY_UPDATE_WITH_SET
- LIMIT_IS_TOP
- RETURNING_END
- EXTRACT_ALLOWS_QUOTES
- TZ_TO_WITH_TIME_ZONE
- SELECT_KINDS
- VALUES_AS_TABLE
- ALTER_TABLE_INCLUDE_COLUMN_KEYWORD
- AGGREGATE_FILTER_SUPPORTED
- SEMI_ANTI_JOIN_WITH_SIDE
- COMPUTED_COLUMN_WITH_TYPE
- SUPPORTS_TABLE_COPY
- TABLESAMPLE_REQUIRES_PARENS
- TABLESAMPLE_SIZE_IS_ROWS
- TABLESAMPLE_KEYWORDS
- TABLESAMPLE_WITH_METHOD
- TABLESAMPLE_SEED_KEYWORD
- DATA_TYPE_SPECIFIERS_ALLOWED
- ENSURE_BOOLS
- CTE_RECURSIVE_KEYWORD_REQUIRED
- SUPPORTS_SINGLE_ARG_CONCAT
- LAST_DAY_SUPPORTS_DATE_PART
- INSERT_OVERWRITE
- SUPPORTS_SELECT_INTO
- SUPPORTS_UNLOGGED_TABLES
- SUPPORTS_CREATE_TABLE_LIKE
- LIKE_PROPERTY_INSIDE_SCHEMA
- MULTI_ARG_DISTINCT
- JSON_TYPE_REQUIRED_FOR_EXTRACTION
- JSON_PATH_BRACKETED_KEY_SUPPORTED
- SUPPORTS_WINDOW_EXCLUDE
- SET_OP_MODIFIERS
- COPY_PARAMS_ARE_WRAPPED
- COPY_PARAMS_EQ_REQUIRED
- COPY_HAS_INTO_KEYWORD
- STAR_EXCEPT
- QUOTE_JSON_PATH
- PAD_FILL_PATTERN_IS_REQUIRED
- ARRAY_CONCAT_IS_VAR_LEN
- SUPPORTS_CONVERT_TIMEZONE
- SUPPORTS_MEDIAN
- ALTER_SET_WRAPPED
- NORMALIZE_EXTRACT_DATE_PARTS
- PARSE_JSON_NAME
- ARRAY_SIZE_NAME
- ALTER_SET_TYPE
- ARRAY_SIZE_DIM_REQUIRED
- TIME_PART_SINGULARS
- TOKEN_MAPPING
- STRUCT_DELIMITER
- PARAMETER_TOKEN
- EXPRESSION_PRECEDES_PROPERTIES_CREATABLES
- WITH_SEPARATED_COMMENTS
- EXCLUDE_COMMENTS
- UNWRAPPED_INTERVAL_VALUES
- PARAMETERIZABLE_TEXT_TYPES
- EXPRESSIONS_WITHOUT_NESTED_CTES
- RESPECT_IGNORE_NULLS_UNSUPPORTED_EXPRESSIONS
- SENTINEL_LINE_BREAK
- pretty
- identify
- normalize
- pad
- unsupported_level
- max_unsupported
- leading_comma
- max_text_width
- comments
- dialect
- normalize_functions
- unsupported_messages
- generate
- preprocess
- unsupported
- sep
- seg
- sanitize_comment
- maybe_comment
- wrap
- no_identify
- normalize_func
- indent
- sql
- uncache_sql
- cache_sql
- characterset_sql
- column_sql
- columnposition_sql
- columndef_sql
- columnconstraint_sql
- computedcolumnconstraint_sql
- autoincrementcolumnconstraint_sql
- compresscolumnconstraint_sql
- generatedasidentitycolumnconstraint_sql
- generatedasrowcolumnconstraint_sql
- periodforsystemtimeconstraint_sql
- notnullcolumnconstraint_sql
- primarykeycolumnconstraint_sql
- uniquecolumnconstraint_sql
- createable_sql
- create_sql
- sequenceproperties_sql
- clone_sql
- describe_sql
- heredoc_sql
- prepend_ctes
- with_sql
- cte_sql
- tablealias_sql
- bitstring_sql
- hexstring_sql
- bytestring_sql
- unicodestring_sql
- rawstring_sql
- datatypeparam_sql
- datatype_sql
- directory_sql
- delete_sql
- drop_sql
- set_operation
- set_operations
- fetch_sql
- limitoptions_sql
- filter_sql
- hint_sql
- indexparameters_sql
- index_sql
- identifier_sql
- hex_sql
- lowerhex_sql
- inputoutputformat_sql
- national_sql
- partition_sql
- properties_sql
- root_properties
- properties
- with_properties
- locate_properties
- property_name
- property_sql
- likeproperty_sql
- fallbackproperty_sql
- journalproperty_sql
- freespaceproperty_sql
- checksumproperty_sql
- mergeblockratioproperty_sql
- datablocksizeproperty_sql
- blockcompressionproperty_sql
- isolatedloadingproperty_sql
- partitionboundspec_sql
- partitionedofproperty_sql
- lockingproperty_sql
- withdataproperty_sql
- withsystemversioningproperty_sql
- insert_sql
- introducer_sql
- kill_sql
- pseudotype_sql
- objectidentifier_sql
- onconflict_sql
- returning_sql
- rowformatdelimitedproperty_sql
- withtablehint_sql
- indextablehint_sql
- historicaldata_sql
- table_sql
- tablefromrows_sql
- tablesample_sql
- pivot_sql
- tuple_sql
- update_sql
- values_sql
- var_sql
- into_sql
- from_sql
- groupingsets_sql
- rollup_sql
- cube_sql
- group_sql
- having_sql
- connect_sql
- prior_sql
- join_sql
- lambda_sql
- lateral_op
- lateral_sql
- limit_sql
- offset_sql
- setitem_sql
- set_sql
- pragma_sql
- lock_sql
- literal_sql
- escape_str
- loaddata_sql
- null_sql
- boolean_sql
- order_sql
- withfill_sql
- cluster_sql
- distribute_sql
- sort_sql
- ordered_sql
- matchrecognizemeasure_sql
- matchrecognize_sql
- query_modifiers
- options_modifier
- for_modifiers
- queryoption_sql
- offset_limit_modifiers
- after_limit_modifiers
- select_sql
- schema_sql
- schema_columns_sql
- star_sql
- parameter_sql
- sessionparameter_sql
- placeholder_sql
- subquery_sql
- qualify_sql
- unnest_sql
- prewhere_sql
- where_sql
- window_sql
- partition_by_sql
- windowspec_sql
- withingroup_sql
- between_sql
- bracket_offset_expressions
- all_sql
- any_sql
- exists_sql
- case_sql
- constraint_sql
- nextvaluefor_sql
- extract_sql
- trim_sql
- convert_concat_args
- concat_sql
- concatws_sql
- check_sql
- foreignkey_sql
- primarykey_sql
- if_sql
- matchagainst_sql
- jsonkeyvalue_sql
- jsonpath_sql
- json_path_part
- formatjson_sql
- jsonobject_sql
- jsonobjectagg_sql
- jsonarray_sql
- jsonarrayagg_sql
- jsoncolumndef_sql
- jsonschema_sql
- jsontable_sql
- openjsoncolumndef_sql
- openjson_sql
- in_sql
- interval_sql
- return_sql
- reference_sql
- anonymous_sql
- paren_sql
- neg_sql
- not_sql
- alias_sql
- pivotalias_sql
- aliases_sql
- atindex_sql
- fromtimezone_sql
- add_sql
- and_sql
- or_sql
- xor_sql
- connector_sql
- bitwiseand_sql
- bitwiseleftshift_sql
- bitwisenot_sql
- bitwiseor_sql
- bitwiserightshift_sql
- bitwisexor_sql
- currentdate_sql
- collate_sql
- command_sql
- comment_sql
- mergetreettlaction_sql
- mergetreettl_sql
- transaction_sql
- commit_sql
- rollback_sql
- altercolumn_sql
- alterindex_sql
- alterdiststyle_sql
- altersortkey_sql
- alterrename_sql
- renamecolumn_sql
- alterset_sql
- alter_sql
- add_column_sql
- droppartition_sql
- addconstraint_sql
- addpartition_sql
- distinct_sql
- ignorenulls_sql
- respectnulls_sql
- havingmax_sql
- intdiv_sql
- dpipe_sql
- div_sql
- safedivide_sql
- overlaps_sql
- distance_sql
- dot_sql
- propertyeq_sql
- escape_sql
- glob_sql
- gt_sql
- gte_sql
- ilike_sql
- ilikeany_sql
- is_sql
- like_sql
- likeany_sql
- similarto_sql
- lt_sql
- lte_sql
- mul_sql
- neq_sql
- nullsafeeq_sql
- nullsafeneq_sql
- slice_sql
- sub_sql
- jsoncast_sql
- try_sql
- log_sql
- use_sql
- binary
- ceil_floor
- function_fallback_sql
- func
- format_args
- too_wide
- format_time
- expressions
- op_expressions
- naked_property
- tag_sql
- token_sql
- userdefinedfunction_sql
- joinhint_sql
- kwarg_sql
- when_sql
- whens_sql
- merge_sql
- tochar_sql
- tonumber_sql
- dictproperty_sql
- dictrange_sql
- dictsubproperty_sql
- duplicatekeyproperty_sql
- uniquekeyproperty_sql
- distributedbyproperty_sql
- oncluster_sql
- clusteredbyproperty_sql
- anyvalue_sql
- querytransform_sql
- indexconstraintoption_sql
- checkcolumnconstraint_sql
- indexcolumnconstraint_sql
- nvl2_sql
- comprehension_sql
- columnprefix_sql
- opclass_sql
- predict_sql
- forin_sql
- refresh_sql
- toarray_sql
- tsordstotime_sql
- tsordstotimestamp_sql
- tsordstodatetime_sql
- tsordstodate_sql
- unixdate_sql
- lastday_sql
- dateadd_sql
- arrayany_sql
- struct_sql
- partitionrange_sql
- truncatetable_sql
- convert_sql
- copyparameter_sql
- credentials_sql
- copy_sql
- semicolon_sql
- datadeletionproperty_sql
- maskingpolicycolumnconstraint_sql
- gapfill_sql
- scope_resolution
- scoperesolution_sql
- parsejson_sql
- rand_sql
- changes_sql
- pad_sql
- summarize_sql
- explodinggenerateseries_sql
- arrayconcat_sql
- converttimezone_sql
- json_sql
- jsonvalue_sql
- conditionalinsert_sql
- multitableinserts_sql
- oncondition_sql
- jsonextractquote_sql
- jsonexists_sql
- arrayagg_sql
- apply_sql
- grant_sql
- grantprivilege_sql
- grantprincipal_sql
- columns_sql
- overlay_sql
- todouble_sql
- string_sql
- median_sql
- overflowtruncatebehavior_sql
- unixseconds_sql
- arraysize_sql
- attach_sql
- detach_sql
- attachoption_sql
- featuresattime_sql
- watermarkcolumnconstraint_sql
- encodeproperty_sql
- includeproperty_sql
- xmlelement_sql
- xmlkeyvalueoption_sql
- partitionbyrangeproperty_sql
- partitionbyrangepropertydynamic_sql
- unpivotcolumns_sql
- analyzesample_sql
- analyzestatistics_sql
- analyzehistogram_sql
- analyzedelete_sql
- analyzelistchainedrows_sql
- analyzevalidate_sql
- analyze_sql
- xmltable_sql
- xmlnamespace_sql
- export_sql
- declare_sql
- declareitem_sql
- recursivewithsearch_sql
- parameterizedagg_sql
- anonymousaggfunc_sql
- combinedaggfunc_sql
- combinedparameterizedagg_sql
- show_sql
- get_put_sql
- translatecharacters_sql